imdb get info
This commit is contained in:
@@ -1,81 +1,57 @@
|
||||
from couchpotato.core.event import addEvent
|
||||
from couchpotato.core.logger import CPLog
|
||||
from couchpotato.core.providers.movie.base import MovieProvider
|
||||
from imdb import IMDb
|
||||
from imdb import IMDb, helpers
|
||||
from imdb._logging import setLevel
|
||||
import time
|
||||
|
||||
log = CPLog(__name__)
|
||||
|
||||
|
||||
class IMDB(MovieProvider):
|
||||
|
||||
info_list = ('main', 'plot', 'release dates', 'taglines', 'synopsis')
|
||||
|
||||
def __init__(self):
|
||||
|
||||
#addEvent('movie.search', self.search)
|
||||
#addEvent('movie.info', self.getInfo)
|
||||
|
||||
self.p = IMDb('http')
|
||||
setLevel('warn')
|
||||
|
||||
def search(self):
|
||||
print 'search'
|
||||
|
||||
def conf(self, option):
|
||||
return self.config.get('IMDB', option)
|
||||
|
||||
def find(self, q, limit = 8, alternative = True):
|
||||
''' Find movie by name '''
|
||||
|
||||
log.info('IMDB - Searching for movie: %s' % q)
|
||||
def search(self, q, limit = 12):
|
||||
|
||||
r = self.p.search_movie(q)
|
||||
|
||||
return self.toResults(r, limit)
|
||||
|
||||
def toResults(self, r, limit = 8, one = False):
|
||||
results = []
|
||||
|
||||
if one:
|
||||
new = self.feedItem()
|
||||
new.imdb = 'tt' + r.movieID
|
||||
new.name = self.toSaveString(r['title'])
|
||||
try:
|
||||
new.year = r['year']
|
||||
except:
|
||||
new.year = ''
|
||||
|
||||
return new
|
||||
else :
|
||||
nr = 0
|
||||
for movie in r:
|
||||
results.append(self.toResults(movie, one = True))
|
||||
nr += 1
|
||||
if nr == limit:
|
||||
break
|
||||
|
||||
return results
|
||||
|
||||
def findById(self, id):
|
||||
''' Find movie by TheMovieDB ID '''
|
||||
print '==' * 80
|
||||
|
||||
return []
|
||||
|
||||
def getInfo(self, identifier = None):
|
||||
|
||||
def findByImdbId(self, id, details = False):
|
||||
''' Find movie by IMDB ID '''
|
||||
m = self.p.get_movie(identifier.replace('tt', ''), info = self.info_list)
|
||||
|
||||
log.info('IMDB - Searching for movie: %s' % str(id))
|
||||
poster = m['cover url']
|
||||
poster_original = helpers.fullSizeCoverURL(m)
|
||||
|
||||
r = self.p.get_movie(id.replace('tt', ''))
|
||||
movie_data = {
|
||||
'id': identifier,
|
||||
'titles': [m['title']],
|
||||
'original_title': m['title'],
|
||||
'rating': {
|
||||
'imdb': (m.get('rating'), m.get('votes')),
|
||||
},
|
||||
'images': {
|
||||
'poster': [poster] if poster else [],
|
||||
'poster_original': [poster_original] if poster_original else [],
|
||||
},
|
||||
'imdb': identifier,
|
||||
'runtime': m.get('runtime')[0].split(':')[1],
|
||||
'released': m.get('release dates')[0].split('::')[1],
|
||||
'year': m['year'],
|
||||
'plot': m.get('synopsis', ''),
|
||||
'tagline': m.get('taglines', '')[0],
|
||||
'genres': m.get('genres', []),
|
||||
}
|
||||
|
||||
if not details:
|
||||
return self.toResults(r, one = True)
|
||||
else:
|
||||
self.p.update(r)
|
||||
self.p.update(r, info = 'release dates')
|
||||
self.p.update(r, info = 'taglines')
|
||||
return r
|
||||
|
||||
def get_IMDb_instance(self):
|
||||
return IMDb('http')
|
||||
|
||||
|
||||
def findReleaseDate(self, movie):
|
||||
pass
|
||||
return movie_data
|
||||
|
||||
@@ -1,197 +0,0 @@
|
||||
"""
|
||||
Character module (imdb package).
|
||||
|
||||
This module provides the Character class, used to store information about
|
||||
a given character.
|
||||
|
||||
Copyright 2007-2010 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
from copy import deepcopy
|
||||
|
||||
from imdb.utils import analyze_name, build_name, flatten, _Container, cmpPeople
|
||||
|
||||
|
||||
class Character(_Container):
    """A Character.

    Every information about a character can be accessed as:
        characterObject['information']
    to get a list of the kind of information stored in a
    Character object, use the keys() method; some useful aliases
    are defined (as "also known as" for the "akas" key);
    see the keys_alias dictionary.
    """
    # The default sets of information retrieved.
    default_info = ('main', 'filmography', 'biography')

    # Aliases for some not-so-intuitive keys.
    keys_alias = {'mini biography': 'biography',
                  'bio': 'biography',
                  'character biography': 'biography',
                  'character biographies': 'biography',
                  'biographies': 'biography',
                  'character bio': 'biography',
                  'aka': 'akas',
                  'also known as': 'akas',
                  'alternate names': 'akas',
                  'personal quotes': 'quotes',
                  'keys': 'keywords',
                  'keyword': 'keywords'}

    keys_tomodify_list = ('biography', 'quotes')

    cmpFunct = cmpPeople

    def _init(self, **kwds):
        """Initialize a Character object.

        *characterID* -- the unique identifier for the character.
        *name* -- the name of the Character, if not in the data dictionary.
        *myName* -- the nickname you use for this character.
        *myID* -- your personal id for this character.
        *data* -- a dictionary used to initialize the object.
        *notes* -- notes about the given character.
        *accessSystem* -- a string representing the data access system used.
        *titlesRefs* -- a dictionary with references to movies.
        *namesRefs* -- a dictionary with references to persons.
        *charactersRefs* -- a dictionary with references to characters.
        *modFunct* -- function called returning text fields.

        Only *name*, *characterID* and *myName* are read here; the other
        keywords are presumably consumed by _Container -- TODO confirm.
        """
        name = kwds.get('name')
        # An explicit name never overrides one already present in the data.
        if name and 'name' not in self.data:
            self.set_name(name)
        self.characterID = kwds.get('characterID', None)
        self.myName = kwds.get('myName', u'')

    def _reset(self):
        """Reset the Character object."""
        self.characterID = None
        self.myName = u''

    def set_name(self, name):
        """Set the name of the character."""
        # XXX: convert name to unicode, if it's a plain string?
        d = analyze_name(name, canonical=0)
        self.data.update(d)

    def _additional_keys(self):
        """Valid keys to append to the data.keys() list."""
        addkeys = []
        if 'name' in self.data:
            addkeys += ['long imdb name']
        if 'headshot' in self.data:
            addkeys += ['full-size headshot']
        return addkeys

    def _getitem(self, key):
        """Handle special keys.

        Returns the computed value for 'long imdb name' and
        'full-size headshot' when the underlying data is available,
        None otherwise.
        """
        ## XXX: can a character have an imdbIndex?
        if key == 'long imdb name' and 'name' in self.data:
            return build_name(self.data)
        if key == 'full-size headshot' and 'headshot' in self.data:
            # _re_fullsizeURL is not defined in this class; presumably a
            # compiled regex inherited from _Container -- TODO confirm.
            return self._re_fullsizeURL.sub('', self.data.get('headshot', ''))
        return None

    def getID(self):
        """Return the characterID."""
        return self.characterID

    def __nonzero__(self):
        """The Character is "false" if the self.data does not contain a name."""
        # XXX: check the name and the characterID?
        if self.data.get('name'):
            return 1
        return 0

    def __contains__(self, item):
        """Return true if this Character was portrayed in the given Movie
        or it was impersonated by the given Person."""
        # Imported lazily, inside the method, and by absolute package path
        # (implicit relative imports such as "from Movie import Movie" are
        # deprecated and inconsistent with this module's other imports).
        from imdb.Movie import Movie
        from imdb.Person import Person
        if isinstance(item, Person):
            for m in flatten(self.data, yieldDictKeys=1, scalar=Movie):
                if item.isSame(m.currentRole):
                    return 1
        elif isinstance(item, Movie):
            for m in flatten(self.data, yieldDictKeys=1, scalar=Movie):
                if item.isSame(m):
                    return 1
        return 0

    def isSameName(self, other):
        """Return true if two character have the same name
        and/or characterID."""
        if not isinstance(other, self.__class__):
            return 0
        if 'name' in self.data and 'name' in other.data and \
                build_name(self.data, canonical=0) == \
                build_name(other.data, canonical=0):
            return 1
        if self.accessSystem == other.accessSystem and \
                self.characterID is not None and \
                self.characterID == other.characterID:
            return 1
        return 0
    isSameCharacter = isSameName

    def __deepcopy__(self, memo):
        """Return a deep copy of a Character instance."""
        c = Character(name=u'', characterID=self.characterID,
                      myName=self.myName, myID=self.myID,
                      data=deepcopy(self.data, memo),
                      notes=self.notes, accessSystem=self.accessSystem,
                      titlesRefs=deepcopy(self.titlesRefs, memo),
                      namesRefs=deepcopy(self.namesRefs, memo),
                      charactersRefs=deepcopy(self.charactersRefs, memo))
        c.current_info = list(self.current_info)
        c.set_mod_funct(self.modFunct)
        return c

    def __repr__(self):
        """String representation of a Character object."""
        r = '<Character id:%s[%s] name:_%s_>' % (self.characterID,
                                                  self.accessSystem,
                                                  self.get('name'))
        # A unicode name promotes the whole string to unicode; __repr__
        # must return a byte string, so encode it back.
        if isinstance(r, unicode):
            r = r.encode('utf_8', 'replace')
        return r

    def __str__(self):
        """Simply print the short name."""
        return self.get('name', u'').encode('utf_8', 'replace')

    def __unicode__(self):
        """Simply print the short name."""
        return self.get('name', u'')

    def summary(self):
        """Return a string with a pretty-printed summary for the character."""
        if not self:
            return u''
        s = u'Character\n=====\nName: %s\n' % self.get('name', u'')
        bio = self.get('biography')
        if bio:
            s += u'Biography: %s\n' % bio[0]
        filmo = self.get('filmography')
        if filmo:
            # Only the first five filmography entries are listed.
            a_list = [x.get('long imdb canonical title', u'')
                      for x in filmo[:5]]
            s += u'Last movies with this character: %s.\n' % u'; '.join(a_list)
        return s
|
||||
|
||||
|
||||
@@ -1,195 +0,0 @@
|
||||
"""
|
||||
company module (imdb package).
|
||||
|
||||
This module provides the company class, used to store information about
|
||||
a given company.
|
||||
|
||||
Copyright 2008-2009 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
from copy import deepcopy
|
||||
|
||||
from imdb.utils import analyze_company_name, build_company_name, \
|
||||
flatten, _Container, cmpCompanies
|
||||
|
||||
|
||||
class Company(_Container):
    """A company.

    Every information about a company can be accessed as:
        companyObject['information']
    to get a list of the kind of information stored in a
    company object, use the keys() method; some useful aliases
    are defined (as "also known as" for the "akas" key);
    see the keys_alias dictionary.
    """
    # The default sets of information retrieved.
    default_info = ('main',)

    # Aliases for some not-so-intuitive keys.
    keys_alias = {
        'distributor': 'distributors',
        'special effects company': 'special effects companies',
        'other company': 'miscellaneous companies',
        'miscellaneous company': 'miscellaneous companies',
        'other companies': 'miscellaneous companies',
        'misc companies': 'miscellaneous companies',
        'misc company': 'miscellaneous companies',
        'production company': 'production companies'}

    keys_tomodify_list = ()

    cmpFunct = cmpCompanies

    def _init(self, **kwds):
        """Initialize a company object.

        *companyID* -- the unique identifier for the company.
        *name* -- the name of the company, if not in the data dictionary.
        *myName* -- the nickname you use for this company.
        *myID* -- your personal id for this company.
        *data* -- a dictionary used to initialize the object.
        *notes* -- notes about the given company.
        *accessSystem* -- a string representing the data access system used.
        *titlesRefs* -- a dictionary with references to movies.
        *namesRefs* -- a dictionary with references to persons.
        *charactersRefs* -- a dictionary with references to characters.
        *modFunct* -- function called returning text fields.

        Only *name*, *companyID* and *myName* are read here; the other
        keywords are presumably consumed by _Container -- TODO confirm.
        """
        name = kwds.get('name')
        # An explicit name never overrides one already present in the data.
        if name and 'name' not in self.data:
            self.set_name(name)
        self.companyID = kwds.get('companyID', None)
        self.myName = kwds.get('myName', u'')

    def _reset(self):
        """Reset the company object."""
        self.companyID = None
        self.myName = u''

    def set_name(self, name):
        """Set the name of the company.

        A trailing parenthesised part (from the first '(' to a final ')')
        is split off and stored as the object's notes, unless notes are
        already set, in which case the full name is analyzed unchanged.
        """
        # XXX: convert name to unicode, if it's a plain string?
        # Company diverges a bit from other classes, being able
        # to directly handle its "notes".  AND THAT'S PROBABLY A BAD IDEA!
        oname = name = name.strip()
        notes = u''
        if name.endswith(')'):
            fparidx = name.find('(')
            if fparidx != -1:
                notes = name[fparidx:]
                name = name[:fparidx].rstrip()
        if self.notes:
            # Existing notes win: keep the parenthesised text in the name.
            name = oname
        d = analyze_company_name(name)
        self.data.update(d)
        if notes and not self.notes:
            self.notes = notes

    def _additional_keys(self):
        """Valid keys to append to the data.keys() list."""
        if 'name' in self.data:
            return ['long imdb name']
        return []

    def _getitem(self, key):
        """Handle special keys.

        Returns the built company name for 'long imdb name' when a name
        is available, None otherwise.
        """
        ## XXX: can a company have an imdbIndex?
        if key == 'long imdb name' and 'name' in self.data:
            return build_company_name(self.data)
        return None

    def getID(self):
        """Return the companyID."""
        return self.companyID

    def __nonzero__(self):
        """The company is "false" if the self.data does not contain a name."""
        # XXX: check the name and the companyID?
        if self.data.get('name'):
            return 1
        return 0

    def __contains__(self, item):
        """Return true if this company and the given Movie are related."""
        # Imported lazily, inside the method, and by absolute package path
        # (the implicit relative form "from Movie import Movie" is
        # deprecated and inconsistent with this module's other imports).
        from imdb.Movie import Movie
        if isinstance(item, Movie):
            for m in flatten(self.data, yieldDictKeys=1, scalar=Movie):
                if item.isSame(m):
                    return 1
        return 0

    def isSameName(self, other):
        """Return true if two company have the same name
        and/or companyID."""
        if not isinstance(other, self.__class__):
            return 0
        if 'name' in self.data and 'name' in other.data and \
                build_company_name(self.data) == \
                build_company_name(other.data):
            return 1
        if self.accessSystem == other.accessSystem and \
                self.companyID is not None and \
                self.companyID == other.companyID:
            return 1
        return 0
    isSameCompany = isSameName

    def __deepcopy__(self, memo):
        """Return a deep copy of a company instance."""
        c = Company(name=u'', companyID=self.companyID,
                    myName=self.myName, myID=self.myID,
                    data=deepcopy(self.data, memo),
                    notes=self.notes, accessSystem=self.accessSystem,
                    titlesRefs=deepcopy(self.titlesRefs, memo),
                    namesRefs=deepcopy(self.namesRefs, memo),
                    charactersRefs=deepcopy(self.charactersRefs, memo))
        c.current_info = list(self.current_info)
        c.set_mod_funct(self.modFunct)
        return c

    def __repr__(self):
        """String representation of a Company object."""
        r = '<Company id:%s[%s] name:_%s_>' % (self.companyID,
                                                self.accessSystem,
                                                self.get('long imdb name'))
        # A unicode name promotes the whole string to unicode; __repr__
        # must return a byte string, so encode it back.
        if isinstance(r, unicode):
            r = r.encode('utf_8', 'replace')
        return r

    def __str__(self):
        """Simply print the short name."""
        return self.get('name', u'').encode('utf_8', 'replace')

    def __unicode__(self):
        """Simply print the short name."""
        return self.get('name', u'')

    def summary(self):
        """Return a string with a pretty-printed summary for the company."""
        if not self:
            return u''
        s = u'Company\n=======\nName: %s\n' % self.get('name', u'')
        for k in ('distributor', 'production company', 'miscellaneous company',
                  'special effects company'):
            # At most five movies are listed per category.
            d = self.get(k, [])[:5]
            if not d:
                continue
            titles = u'; '.join([x.get('long imdb title', u'') for x in d])
            s += u'Last movies from this company (%s): %s.\n' % (k, titles)
        return s
|
||||
|
||||
|
||||
@@ -1,398 +0,0 @@
|
||||
"""
|
||||
Movie module (imdb package).
|
||||
|
||||
This module provides the Movie class, used to store information about
|
||||
a given movie.
|
||||
|
||||
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
from copy import deepcopy
|
||||
|
||||
from imdb import articles
|
||||
from imdb.utils import analyze_title, build_title, canonicalTitle, \
|
||||
flatten, _Container, cmpMovies
|
||||
|
||||
|
||||
class Movie(_Container):
    """A Movie.

    Every information about a movie can be accessed as:
        movieObject['information']
    to get a list of the kind of information stored in a
    Movie object, use the keys() method; some useful aliases
    are defined (as "casting" for the "casting director" key); see
    the keys_alias dictionary.
    """
    # The default sets of information retrieved.
    default_info = ('main', 'plot')

    # Aliases for some not-so-intuitive keys.
    keys_alias = {
        'tv schedule': 'airing',
        'user rating': 'rating',
        'plot summary': 'plot',
        'plot summaries': 'plot',
        'directed by': 'director',
        'created by': 'creator',
        'writing credits': 'writer',
        'produced by': 'producer',
        'original music by': 'original music',
        'non-original music by': 'non-original music',
        'music': 'original music',
        'cinematography by': 'cinematographer',
        'cinematography': 'cinematographer',
        'film editing by': 'editor',
        'film editing': 'editor',
        'editing': 'editor',
        'actors': 'cast',
        'actresses': 'cast',
        'casting by': 'casting director',
        'casting': 'casting director',
        'art direction by': 'art direction',
        'set decoration by': 'set decoration',
        'costume design by': 'costume designer',
        'costume design': 'costume designer',
        'makeup department': 'make up',
        'makeup': 'make up',
        'make-up': 'make up',
        'production management': 'production manager',
        'production company': 'production companies',
        'second unit director or assistant director':
            'assistant director',
        'second unit director': 'assistant director',
        'sound department': 'sound crew',
        'costume and wardrobe department': 'costume department',
        'special effects by': 'special effects',
        'visual effects by': 'visual effects',
        'special effects company': 'special effects companies',
        'stunts': 'stunt performer',
        'other crew': 'miscellaneous crew',
        'misc crew': 'miscellaneous crew',
        'miscellaneouscrew': 'miscellaneous crew',
        'crewmembers': 'miscellaneous crew',
        'crew members': 'miscellaneous crew',
        'other companies': 'miscellaneous companies',
        'misc companies': 'miscellaneous companies',
        'miscellaneous company': 'miscellaneous companies',
        'misc company': 'miscellaneous companies',
        'other company': 'miscellaneous companies',
        'aka': 'akas',
        'also known as': 'akas',
        'country': 'countries',
        'production country': 'countries',
        'production countries': 'countries',
        'genre': 'genres',
        'runtime': 'runtimes',
        'lang': 'languages',
        'color': 'color info',
        'cover': 'cover url',
        'full-size cover': 'full-size cover url',
        'seasons': 'number of seasons',
        'language': 'languages',
        'certificate': 'certificates',
        'certifications': 'certificates',
        'certification': 'certificates',
        'miscellaneous links': 'misc links',
        'miscellaneous': 'misc links',
        'soundclips': 'sound clips',
        'videoclips': 'video clips',
        'photographs': 'photo sites',
        'distributor': 'distributors',
        'distribution': 'distributors',
        'distribution companies': 'distributors',
        'distribution company': 'distributors',
        'guest': 'guests',
        'guest appearances': 'guests',
        'tv guests': 'guests',
        'notable tv guest appearances': 'guests',
        'episodes cast': 'guests',
        'episodes number': 'number of episodes',
        'amazon review': 'amazon reviews',
        'merchandising': 'merchandising links',
        'merchandise': 'merchandising links',
        'sales': 'merchandising links',
        'faq': 'faqs',
        'parental guide': 'parents guide',
        'frequently asked questions': 'faqs'}

    keys_tomodify_list = ('plot', 'trivia', 'alternate versions', 'goofs',
                          'quotes', 'dvd', 'laserdisc', 'news', 'soundtrack',
                          'crazy credits', 'business', 'supplements',
                          'video review', 'faqs')

    cmpFunct = cmpMovies

    def _init(self, **kwds):
        """Initialize a Movie object.

        *movieID* -- the unique identifier for the movie.
        *title* -- the title of the Movie, if not in the data dictionary.
        *myTitle* -- your personal title for the movie.
        *myID* -- your personal identifier for the movie.
        *data* -- a dictionary used to initialize the object.
        *currentRole* -- a Character instance representing the current role
                         or duty of a person in this movie, or a Person
                         object representing the actor/actress who played
                         a given character in a Movie.  If a string is
                         passed, an object is automatically built.
        *roleID* -- if available, the characterID/personID of the currentRole
                    object.
        *roleIsPerson* -- when False (default) the currentRole is assumed
                          to be a Character object, otherwise a Person.
        *notes* -- notes for the person referred in the currentRole
                    attribute; e.g.: '(voice)'.
        *accessSystem* -- a string representing the data access system used.
        *titlesRefs* -- a dictionary with references to movies.
        *namesRefs* -- a dictionary with references to persons.
        *charactersRefs* -- a dictionary with references to characters.
        *modFunct* -- function called returning text fields.

        Only *title*, *movieID* and *myTitle* are read here; the other
        keywords are presumably consumed by _Container -- TODO confirm.
        """
        title = kwds.get('title')
        # An explicit title never overrides one already present in the data.
        if title and 'title' not in self.data:
            self.set_title(title)
        self.movieID = kwds.get('movieID', None)
        self.myTitle = kwds.get('myTitle', u'')

    def _reset(self):
        """Reset the Movie object."""
        self.movieID = None
        self.myTitle = u''

    def set_title(self, title):
        """Set the title of the movie."""
        # XXX: convert title to unicode, if it's a plain string?
        d_title = analyze_title(title)
        self.data.update(d_title)

    def _additional_keys(self):
        """Valid keys to append to the data.keys() list."""
        addkeys = []
        if 'title' in self.data:
            addkeys += ['canonical title', 'long imdb title',
                        'long imdb canonical title',
                        'smart canonical title',
                        'smart long imdb canonical title']
        if 'episode of' in self.data:
            addkeys += ['long imdb episode title', 'series title',
                        'canonical series title', 'episode title',
                        'canonical episode title',
                        'smart canonical series title',
                        'smart canonical episode title']
        if 'cover url' in self.data:
            addkeys += ['full-size cover url']
        return addkeys

    def guessLanguage(self):
        """Guess the language of the title of this movie; returns None
        if there are no hints.

        The first entry of 'languages' is preferred; otherwise the first
        entry of 'countries' is looked up in articles.COUNTRY_LANG.
        """
        lang = self.get('languages')
        if lang:
            return lang[0]
        country = self.get('countries')
        if country:
            return articles.COUNTRY_LANG.get(country[0])
        # Return None explicitly rather than leaking an empty
        # 'languages' value (e.g. []) to the caller.
        return None

    def smartCanonicalTitle(self, title=None, lang=None):
        """Return the canonical title, guessing its language.
        The title can be forced with the 'title' argument (internally
        used) and the language can be forced with the 'lang' argument,
        otherwise it's auto-detected."""
        if title is None:
            title = self.data.get('title', u'')
        if lang is None:
            lang = self.guessLanguage()
        return canonicalTitle(title, lang=lang)

    def _getitem(self, key):
        """Handle special keys.

        Episode-related keys are resolved first (only when the movie has
        an 'episode of' entry), then title-derived keys, then the
        full-size cover URL.  Returns None for anything else.
        """
        if 'episode of' in self.data:
            if key == 'long imdb episode title':
                return build_title(self.data)
            elif key == 'series title':
                return self.data['episode of']['title']
            elif key == 'canonical series title':
                ser_title = self.data['episode of']['title']
                return canonicalTitle(ser_title)
            elif key == 'smart canonical series title':
                ser_title = self.data['episode of']['title']
                return self.smartCanonicalTitle(ser_title)
            elif key == 'episode title':
                return self.data.get('title', u'')
            elif key == 'canonical episode title':
                return canonicalTitle(self.data.get('title', u''))
            elif key == 'smart canonical episode title':
                return self.smartCanonicalTitle(self.data.get('title', u''))
        if 'title' in self.data:
            if key == 'title':
                return self.data['title']
            elif key == 'long imdb title':
                return build_title(self.data)
            elif key == 'canonical title':
                return canonicalTitle(self.data['title'])
            elif key == 'smart canonical title':
                return self.smartCanonicalTitle(self.data['title'])
            elif key == 'long imdb canonical title':
                return build_title(self.data, canonical=1)
            elif key == 'smart long imdb canonical title':
                return build_title(self.data, canonical=1,
                                   lang=self.guessLanguage())
        if key == 'full-size cover url' and 'cover url' in self.data:
            # _re_fullsizeURL is not defined in this class; presumably a
            # compiled regex inherited from _Container -- TODO confirm.
            return self._re_fullsizeURL.sub('', self.data.get('cover url', ''))
        return None

    def getID(self):
        """Return the movieID."""
        return self.movieID

    def __nonzero__(self):
        """The Movie is "false" if the self.data does not contain a title."""
        # XXX: check the title and the movieID?
        if 'title' in self.data:
            return 1
        return 0

    def isSameTitle(self, other):
        """Return true if this and the compared object have the same
        long imdb title and/or movieID.
        """
        # XXX: obsolete?
        if not isinstance(other, self.__class__):
            return 0
        if 'title' in self.data and 'title' in other.data and \
                build_title(self.data, canonical=0) == \
                build_title(other.data, canonical=0):
            return 1
        if self.accessSystem == other.accessSystem and \
                self.movieID is not None and self.movieID == other.movieID:
            return 1
        return 0
    isSameMovie = isSameTitle  # XXX: just for backward compatibility.

    def __contains__(self, item):
        """Return true if the given Person object is listed in this Movie,
        or if the given Character is represented in this Movie, or if
        the given Company is related to this Movie."""
        # Imported lazily, inside the method, and by absolute package path
        # (implicit relative imports such as "from Person import Person"
        # are deprecated and inconsistent with this module's other imports).
        from imdb.Person import Person
        from imdb.Character import Character
        from imdb.Company import Company
        # Container types flatten() is allowed to recurse into.
        descend = (list, dict, tuple, Movie)
        if isinstance(item, Person):
            for p in flatten(self.data, yieldDictKeys=1, scalar=Person,
                             toDescend=descend):
                if item.isSame(p):
                    return 1
        elif isinstance(item, Character):
            # Characters are matched against the role of each listed person.
            for p in flatten(self.data, yieldDictKeys=1, scalar=Person,
                             toDescend=descend):
                if item.isSame(p.currentRole):
                    return 1
        elif isinstance(item, Company):
            for c in flatten(self.data, yieldDictKeys=1, scalar=Company,
                             toDescend=descend):
                if item.isSame(c):
                    return 1
        return 0

    def __deepcopy__(self, memo):
        """Return a deep copy of a Movie instance."""
        m = Movie(title=u'', movieID=self.movieID, myTitle=self.myTitle,
                  myID=self.myID, data=deepcopy(self.data, memo),
                  currentRole=deepcopy(self.currentRole, memo),
                  roleIsPerson=self._roleIsPerson,
                  notes=self.notes, accessSystem=self.accessSystem,
                  titlesRefs=deepcopy(self.titlesRefs, memo),
                  namesRefs=deepcopy(self.namesRefs, memo),
                  charactersRefs=deepcopy(self.charactersRefs, memo))
        m.current_info = list(self.current_info)
        m.set_mod_funct(self.modFunct)
        return m

    def __repr__(self):
        """String representation of a Movie object."""
        # XXX: add also currentRole and notes, if present?
        # NOTE: has_key() is kept on purpose: "key in self" would go
        # through __contains__ above, which tests Person/Character/Company
        # membership and returns 0 for a plain string key.
        if self.has_key('long imdb episode title'):
            title = self.get('long imdb episode title')
        else:
            title = self.get('long imdb title')
        r = '<Movie id:%s[%s] title:_%s_>' % (self.movieID, self.accessSystem,
                                              title)
        # A unicode title promotes the whole string to unicode; __repr__
        # must return a byte string, so encode it back.
        if isinstance(r, unicode):
            r = r.encode('utf_8', 'replace')
        return r

    def __str__(self):
        """Simply print the short title."""
        return self.get('title', u'').encode('utf_8', 'replace')

    def __unicode__(self):
        """Simply print the short title."""
        return self.get('title', u'')

    def summary(self):
        """Return a string with a pretty-printed summary for the movie."""
        if not self:
            return u''

        def _nameAndRole(personList, joiner=u', '):
            """Build a pretty string with name and role."""
            nl = []
            for person in personList:
                n = person.get('name', u'')
                if person.currentRole:
                    n += u' (%s)' % person.currentRole
                nl.append(n)
            return joiner.join(nl)

        s = u'Movie\n=====\nTitle: %s\n' % \
            self.get('long imdb canonical title', u'')
        genres = self.get('genres')
        if genres:
            s += u'Genres: %s.\n' % u', '.join(genres)
        director = self.get('director')
        if director:
            s += u'Director: %s.\n' % _nameAndRole(director)
        writer = self.get('writer')
        if writer:
            s += u'Writer: %s.\n' % _nameAndRole(writer)
        cast = self.get('cast')
        if cast:
            # Only the first five cast members are listed.
            cast = cast[:5]
            s += u'Cast: %s.\n' % _nameAndRole(cast)
        runtime = self.get('runtimes')
        if runtime:
            s += u'Runtime: %s.\n' % u', '.join(runtime)
        countries = self.get('countries')
        if countries:
            s += u'Country: %s.\n' % u', '.join(countries)
        lang = self.get('languages')
        if lang:
            s += u'Language: %s.\n' % u', '.join(lang)
        rating = self.get('rating')
        if rating:
            s += u'Rating: %s' % rating
            nr_votes = self.get('votes')
            if nr_votes:
                s += u' (%s votes)' % nr_votes
            s += u'.\n'
        plot = self.get('plot')
        if not plot:
            plot = self.get('plot summary')
            if plot:
                # Wrap the fallback value so it is indexed like 'plot' below.
                plot = [plot]
        if plot:
            plot = plot[0]
            # Drop everything from the first '::' separator onwards.
            i = plot.find('::')
            if i != -1:
                plot = plot[:i]
            s += u'Plot: %s' % plot
        return s
|
||||
|
||||
|
||||
@@ -1,275 +0,0 @@
|
||||
"""
|
||||
Person module (imdb package).
|
||||
|
||||
This module provides the Person class, used to store information about
|
||||
a given person.
|
||||
|
||||
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
from copy import deepcopy
|
||||
|
||||
from imdb.utils import analyze_name, build_name, normalizeName, \
|
||||
flatten, _Container, cmpPeople
|
||||
|
||||
|
||||
class Person(_Container):
    """A Person.

    Every piece of information about a person can be accessed as:
        personObject['information']
    to get a list of the kinds of information stored in a
    Person object, use the keys() method; some useful aliases
    are defined (as "biography" for the "mini biography" key);
    see the keys_alias dictionary.
    """
    # The default sets of information retrieved.
    default_info = ('main', 'filmography', 'biography')

    # Aliases for some not-so-intuitive keys.
    keys_alias = {'biography': 'mini biography',
                  'bio': 'mini biography',
                  'aka': 'akas',
                  'also known as': 'akas',
                  'nick name': 'nick names',
                  'nicks': 'nick names',
                  'nickname': 'nick names',
                  'miscellaneouscrew': 'miscellaneous crew',
                  'crewmembers': 'miscellaneous crew',
                  'misc': 'miscellaneous crew',
                  'guest': 'notable tv guest appearances',
                  'guests': 'notable tv guest appearances',
                  'tv guest': 'notable tv guest appearances',
                  'guest appearances': 'notable tv guest appearances',
                  'spouses': 'spouse',
                  'salary': 'salary history',
                  'salaries': 'salary history',
                  'otherworks': 'other works',
                  "maltin's biography":
                        "biography from leonard maltin's movie encyclopedia",
                  "leonard maltin's biography":
                        "biography from leonard maltin's movie encyclopedia",
                  'real name': 'birth name',
                  'where are they now': 'where now',
                  'personal quotes': 'quotes',
                  'mini-biography author': 'imdb mini-biography by',
                  'biography author': 'imdb mini-biography by',
                  'genre': 'genres',
                  'portrayed': 'portrayed in',
                  'keys': 'keywords',
                  'trademarks': 'trade mark',
                  'trade mark': 'trade mark',
                  'trade marks': 'trade mark',
                  'trademark': 'trade mark',
                  'pictorials': 'pictorial',
                  'magazine covers': 'magazine cover photo',
                  'magazine-covers': 'magazine cover photo',
                  'tv series episodes': 'episodes',
                  'tv-series episodes': 'episodes',
                  'articles': 'article',
                  'keyword': 'keywords'}

    # 'nick names'???
    keys_tomodify_list = ('mini biography', 'spouse', 'quotes', 'other works',
                        'salary history', 'trivia', 'trade mark', 'news',
                        'books', 'biographical movies', 'portrayed in',
                        'where now', 'interviews', 'article',
                        "biography from leonard maltin's movie encyclopedia")

    cmpFunct = cmpPeople

    def _init(self, **kwds):
        """Initialize a Person object.

        *personID* -- the unique identifier for the person.
        *name* -- the name of the Person, if not in the data dictionary.
        *myName* -- the nickname you use for this person.
        *myID* -- your personal id for this person.
        *data* -- a dictionary used to initialize the object.
        *currentRole* -- a Character instance representing the current role
                         or duty of a person in this movie, or a Person
                         object representing the actor/actress who played
                         a given character in a Movie.  If a string is
                         passed, an object is automatically built.
        *roleID* -- if available, the characterID/personID of the currentRole
                    object.
        *roleIsPerson* -- when False (default) the currentRole is assumed
                          to be a Character object, otherwise a Person.
        *notes* -- notes about the given person for a specific movie
                    or role (e.g.: the alias used in the movie credits).
        *accessSystem* -- a string representing the data access system used.
        *titlesRefs* -- a dictionary with references to movies.
        *namesRefs* -- a dictionary with references to persons.
        *modFunct* -- function called returning text fields.
        *billingPos* -- position of this person in the credits list.
        """
        name = kwds.get('name')
        # A name already present in the data dictionary wins over the
        # 'name' keyword.
        if name and 'name' not in self.data:
            self.set_name(name)
        self.personID = kwds.get('personID', None)
        self.myName = kwds.get('myName', u'')
        self.billingPos = kwds.get('billingPos', None)

    def _reset(self):
        """Reset the Person object."""
        self.personID = None
        self.myName = u''
        self.billingPos = None

    def _clear(self):
        """Reset the dictionary."""
        self.billingPos = None

    def set_name(self, name):
        """Set the name of the person."""
        # XXX: convert name to unicode, if it's a plain string?
        d = analyze_name(name, canonical=1)
        self.data.update(d)

    def _additional_keys(self):
        """Valid keys to append to the data.keys() list."""
        addkeys = []
        if 'name' in self.data:
            addkeys += ['canonical name', 'long imdb name',
                        'long imdb canonical name']
        if 'headshot' in self.data:
            addkeys += ['full-size headshot']
        return addkeys

    def _getitem(self, key):
        """Handle special keys.

        Return the computed value for the name-derived keys and for
        'full-size headshot'; None for every other key.
        """
        if 'name' in self.data:
            if key == 'name':
                return normalizeName(self.data['name'])
            elif key == 'canonical name':
                return self.data['name']
            elif key == 'long imdb name':
                return build_name(self.data, canonical=0)
            elif key == 'long imdb canonical name':
                return build_name(self.data)
        if key == 'full-size headshot' and 'headshot' in self.data:
            # _re_fullsizeURL is not defined in this class; presumably it
            # is provided by _Container -- TODO confirm.
            return self._re_fullsizeURL.sub('', self.data.get('headshot', ''))
        return None

    def getID(self):
        """Return the personID."""
        return self.personID

    def __nonzero__(self):
        """The Person is "false" if the self.data does not contain a name."""
        # XXX: check the name and the personID?
        return 'name' in self.data

    # Python 3 spelling of the truth-value hook.
    __bool__ = __nonzero__

    def __contains__(self, item):
        """Return true if this Person has worked in the given Movie,
        or if the given Character was played by this Person."""
        # Imported here, and not at module level, as in the original code
        # (NOTE(review): presumably to avoid a circular import).
        from imdb.Movie import Movie
        from imdb.Character import Character
        if isinstance(item, Movie):
            for m in flatten(self.data, yieldDictKeys=1, scalar=Movie):
                if item.isSame(m):
                    return 1
        elif isinstance(item, Character):
            for m in flatten(self.data, yieldDictKeys=1, scalar=Movie):
                if item.isSame(m.currentRole):
                    return 1
        return 0

    def isSameName(self, other):
        """Return true if two persons have the same name and imdbIndex
        and/or personID.
        """
        if not isinstance(other, self.__class__):
            return 0
        if 'name' in self.data and \
                'name' in other.data and \
                build_name(self.data, canonical=1) == \
                build_name(other.data, canonical=1):
            return 1
        if self.accessSystem == other.accessSystem and \
                self.personID and self.personID == other.personID:
            return 1
        return 0
    isSamePerson = isSameName # XXX: just for backward compatibility.

    def __deepcopy__(self, memo):
        """Return a deep copy of a Person instance."""
        p = Person(name=u'', personID=self.personID, myName=self.myName,
                    myID=self.myID, data=deepcopy(self.data, memo),
                    currentRole=deepcopy(self.currentRole, memo),
                    roleIsPerson=self._roleIsPerson,
                    notes=self.notes, accessSystem=self.accessSystem,
                    titlesRefs=deepcopy(self.titlesRefs, memo),
                    namesRefs=deepcopy(self.namesRefs, memo),
                    charactersRefs=deepcopy(self.charactersRefs, memo))
        p.current_info = list(self.current_info)
        p.set_mod_funct(self.modFunct)
        p.billingPos = self.billingPos
        return p

    def __repr__(self):
        """String representation of a Person object."""
        # XXX: add also currentRole and notes, if present?
        r = '<Person id:%s[%s] name:_%s_>' % (self.personID, self.accessSystem,
                                        self.get('long imdb canonical name'))
        if isinstance(r, unicode): r = r.encode('utf_8', 'replace')
        return r

    def __str__(self):
        """Simply print the short name."""
        return self.get('name', u'').encode('utf_8', 'replace')

    def __unicode__(self):
        """Simply print the short name."""
        return self.get('name', u'')

    def summary(self):
        """Return a string with a pretty-printed summary for the person."""
        if not self: return u''
        s = u'Person\n=====\nName: %s\n' % \
                                self.get('long imdb canonical name', u'')
        bdate = self.get('birth date')
        if bdate:
            s += u'Birth date: %s' % bdate
            bnotes = self.get('birth notes')
            if bnotes:
                s += u' (%s)' % bnotes
            s += u'.\n'
        ddate = self.get('death date')
        if ddate:
            s += u'Death date: %s' % ddate
            dnotes = self.get('death notes')
            if dnotes:
                s += u' (%s)' % dnotes
            s += u'.\n'
        bio = self.get('mini biography')
        if bio:
            # Only the first biography is shown.
            s += u'Biography: %s\n' % bio[0]
        director = self.get('director')
        if director:
            d_list = [x.get('long imdb canonical title', u'')
                        for x in director[:3]]
            s += u'Last movies directed: %s.\n' % u'; '.join(d_list)
        act = self.get('actor') or self.get('actress')
        if act:
            a_list = [x.get('long imdb canonical title', u'')
                        for x in act[:5]]
            s += u'Last movies acted: %s.\n' % u'; '.join(a_list)
        return s
|
||||
|
||||
|
||||
@@ -1,907 +0,0 @@
|
||||
"""
|
||||
imdb package.
|
||||
|
||||
This package can be used to retrieve information about a movie or
|
||||
a person from the IMDb database.
|
||||
It can fetch data through different media (e.g.: the IMDb web pages,
|
||||
a SQL database, etc.)
|
||||
|
||||
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
# Public names of the package (what "from imdb import *" exports).
__all__ = [
    'IMDb',
    'IMDbError',
    'Movie',
    'Person',
    'Character',
    'Company',
    'available_access_systems',
]
# Release identifier, reachable under both spellings.
VERSION = '4.8dev20110303'
__version__ = VERSION
|
||||
|
||||
# Import compatibility module (importing it is enough).
|
||||
import _compat
|
||||
|
||||
import sys, os, ConfigParser, logging
|
||||
from types import MethodType
|
||||
|
||||
from imdb import Movie, Person, Character, Company
|
||||
import imdb._logging
|
||||
from imdb._exceptions import IMDbError, IMDbDataAccessError
|
||||
from imdb.utils import build_title, build_name, build_company_name
|
||||
|
||||
# Logger used for auxiliary messages.
_aux_logger = logging.getLogger('imdbpy.aux')


# Root of every URL defined below.
imdbURL_base = 'http://akas.imdb.com/'

# Sections of the site: http://akas.imdb.com/<section>/
imdbURL_movie_base = imdbURL_base + 'title/'
imdbURL_person_base = imdbURL_base + 'name/'
imdbURL_character_base = imdbURL_base + 'character/'
imdbURL_company_base = imdbURL_base + 'company/'

# Main pages; the '%s' placeholder takes the numeric part of the ID,
# e.g. http://akas.imdb.com/title/tt%s/
imdbURL_movie_main = '%stt%%s/' % imdbURL_movie_base
imdbURL_person_main = '%snm%%s/' % imdbURL_person_base
imdbURL_character_main = '%sch%%s/' % imdbURL_character_base
imdbURL_company_main = '%sco%%s/' % imdbURL_company_base

# http://akas.imdb.com/keyword/%s/
imdbURL_keyword_main = '%skeyword/%%s/' % imdbURL_base
# Charts: http://akas.imdb.com/chart/top and .../chart/bottom
imdbURL_top250 = '%schart/top' % imdbURL_base
imdbURL_bottom100 = '%schart/bottom' % imdbURL_base
# Search page; the placeholder takes the query string:
# http://akas.imdb.com/find?%s
imdbURL_find = '%sfind?%%s' % imdbURL_base

# Name of the configuration file.
confFileName = 'imdbpy.cfg'
|
||||
|
||||
class ConfigParserWithCase(ConfigParser.ConfigParser):
    """A case-sensitive parser for configuration files."""

    def __init__(self, defaults=None, confFile=None, *args, **kwds):
        """Initialize the parser.

        *defaults* -- defaults values.
        *confFile* -- the file (or list of files) to parse; when None,
                      the standard locations are tried (current directory,
                      home directory and the system-wide 'etc' directory).

        Files are read in order, stopping at the first one that provides
        an 'imdbpy' section.  Extra positional and keyword arguments are
        accepted and ignored.
        """
        ConfigParser.ConfigParser.__init__(self, defaults=defaults)
        if confFile is None:
            dotFileName = '.' + confFileName
            # Current and home directory.
            confFile = [os.path.join(os.getcwd(), confFileName),
                        os.path.join(os.getcwd(), dotFileName),
                        os.path.join(os.path.expanduser('~'), confFileName),
                        os.path.join(os.path.expanduser('~'), dotFileName)]
            if os.name == 'posix':
                sep = getattr(os.path, 'sep', '/')
                # /etc/ and /etc/conf.d/
                confFile.append(os.path.join(sep, 'etc', confFileName))
                confFile.append(os.path.join(sep, 'etc', 'conf.d',
                                            confFileName))
            else:
                # etc subdirectory of sys.prefix, for non-unix systems.
                confFile.append(os.path.join(sys.prefix, 'etc', confFileName))
        elif isinstance(confFile, (str, unicode)):
            # A single path must be wrapped: iterating over a string would
            # try to read one "file" per character.
            confFile = [confFile]
        for fname in confFile:
            try:
                self.read(fname)
            except (ConfigParser.MissingSectionHeaderError,
                    ConfigParser.ParsingError) as e:
                # A broken file is reported and skipped.
                _aux_logger.warning('Troubles reading config file: %s', e)
            # Stop at the first valid file.
            if self.has_section('imdbpy'):
                break

    def optionxform(self, optionstr):
        """Option names are case sensitive."""
        return optionstr

    def _manageValue(self, value):
        """Custom substitutions for values.

        Strings naming a boolean state are converted to that boolean and
        'none' (in any case) to None; everything else is returned as is.
        """
        if not isinstance(value, (str, unicode)):
            return value
        vlower = value.lower()
        if vlower in self._boolean_states:
            return self._boolean_states[vlower]
        elif vlower == 'none':
            return None
        return value

    def get(self, section, option, *args, **kwds):
        """Return the value of an option from a given section."""
        value = ConfigParser.ConfigParser.get(self, section, option,
                                            *args, **kwds)
        return self._manageValue(value)

    def items(self, section, *args, **kwds):
        """Return a list of (key, value) tuples of items of the
        given section; an empty list if the section is missing."""
        if section != 'DEFAULT' and not self.has_section(section):
            return []
        keys = ConfigParser.ConfigParser.options(self, section)
        # Go through self.get() so that the custom substitutions apply.
        return [(k, self.get(section, k, *args, **kwds)) for k in keys]

    def getDict(self, section):
        """Return a dictionary of items of the specified section."""
        return dict(self.items(section))
|
||||
|
||||
|
||||
def IMDb(accessSystem=None, *arguments, **keywords):
    """Return an instance of the appropriate class.

    The accessSystem parameter is used to specify the kind of
    the preferred access system.  When it is None, 'auto' or 'config',
    the configuration file is consulted and 'http' is the fallback.

    The 'loggingLevel' and 'loggingConfig' keywords are consumed here;
    every other argument is passed to the access system class.

    An IMDbError is raised for unknown, removed or not installed
    access systems.
    """
    if accessSystem is None or accessSystem in ('auto', 'config'):
        try:
            cfg_file = ConfigParserWithCase(*arguments, **keywords)
            # Parameters set by the code take precedence.
            kwds = cfg_file.getDict('imdbpy')
            accessSystem = kwds.pop('accessSystem', 'http')
            kwds.update(keywords)
            keywords = kwds
        except Exception as e:
            logging.getLogger('imdbpy').warning(
                'Unable to read configuration file; complete error: %s', e)
            # It just LOOKS LIKE a bad habit: we tried to read config
            # options from some files, but something is gone horribly
            # wrong: ignore everything and pretend we were called with
            # the 'http' accessSystem.
            accessSystem = 'http'
    if 'loggingLevel' in keywords:
        imdb._logging.setLevel(keywords['loggingLevel'])
        del keywords['loggingLevel']
    if 'loggingConfig' in keywords:
        logCfg = keywords.pop('loggingConfig')
        try:
            # Imported under another name: a plain "import logging.config"
            # here would turn 'logging' into a local of the whole function.
            from logging import config as logging_config
            logging_config.fileConfig(os.path.expanduser(logCfg))
        except Exception as e:
            logging.getLogger('imdbpy').warning(
                'unable to read logger config: %s', e)
    # The parser modules are imported lazily, so that only the requested
    # access system must be installed; absolute imports keep the stdlib
    # 'parser' module out of the way.
    if accessSystem in ('http', 'web', 'html'):
        from imdb.parser.http import IMDbHTTPAccessSystem
        return IMDbHTTPAccessSystem(*arguments, **keywords)
    elif accessSystem in ('httpThin', 'webThin', 'htmlThin'):
        logging.warning('httpThin is badly broken and'
                        ' will not be fixed; please switch'
                        ' to "http" or "mobile"')
        from imdb.parser.http import IMDbHTTPAccessSystem
        return IMDbHTTPAccessSystem(isThin=1, *arguments, **keywords)
    elif accessSystem in ('mobile',):
        from imdb.parser.mobile import IMDbMobileAccessSystem
        return IMDbMobileAccessSystem(*arguments, **keywords)
    elif accessSystem in ('local', 'files'):
        # The local access system was removed since IMDbPY 4.2.
        raise IMDbError('the local access system was removed since IMDbPY 4.2')
    elif accessSystem in ('sql', 'db', 'database'):
        try:
            from imdb.parser.sql import IMDbSqlAccessSystem
        except ImportError:
            raise IMDbError('the sql access system is not installed')
        return IMDbSqlAccessSystem(*arguments, **keywords)
    else:
        raise IMDbError('unknown kind of data access system: "%s"'
                        % accessSystem)
|
||||
|
||||
|
||||
def available_access_systems():
    """Return the list of available data access systems.

    A system is listed when the module implementing it can be imported;
    the possible names are 'http', 'httpThin', 'mobile' and 'sql'.
    """
    asList = []
    # XXX: trying to import modules is a good thing?
    # Absolute imports are used: a bare "parser" may be resolved to the
    # unrelated module of the standard library.
    try:
        from imdb.parser.http import IMDbHTTPAccessSystem
        asList += ['http', 'httpThin']
    except ImportError:
        pass
    try:
        from imdb.parser.mobile import IMDbMobileAccessSystem
        asList.append('mobile')
    except ImportError:
        pass
    try:
        from imdb.parser.sql import IMDbSqlAccessSystem
        asList.append('sql')
    except ImportError:
        pass
    return asList
|
||||
|
||||
|
||||
# XXX: this is only a guess.
# The encoding of the standard input (or, when it is not available, the
# default encoding of the interpreter) is used to decode the byte strings
# given to the search_*() methods below.
# Passing unicode strings to those methods is always the safer choice.
encoding = getattr(sys.stdin, 'encoding', '')
if not encoding:
    encoding = sys.getdefaultencoding()
|
||||
|
||||
class IMDbBase:
|
||||
"""The base class used to search for a movie/person/character and
|
||||
to get a Movie/Person/Character object.
|
||||
|
||||
This class cannot directly fetch data of any kind and so you
|
||||
have to search the "real" code into a subclass."""
|
||||
|
||||
# The name of the preferred access system (MUST be overridden
|
||||
# in the subclasses).
|
||||
accessSystem = 'UNKNOWN'
|
||||
|
||||
# Top-level logger for IMDbPY.
|
||||
_imdb_logger = logging.getLogger('imdbpy')
|
||||
|
||||
def __init__(self, defaultModFunct=None, results=20, keywordsResults=100,
            *arguments, **keywords):
    """Set up the access system.

    *defaultModFunct* -- if given, the function used by default by the
                         Person, Movie and Character objects, when
                         accessing their text fields.
    *results* -- number of results to get (default: 20).
    *keywordsResults* -- number of results for keywords (default: 100).

    Values that cannot be converted to an integer, or lower than 1,
    are replaced by the respective default.
    """
    def _positive_int(value, fallback):
        """Return value as an integer >= 1, or the fallback."""
        try:
            number = int(value)
        except (TypeError, ValueError):
            return fallback
        if number < 1:
            return fallback
        return number

    # The function used to output the strings that need modification (the
    # ones containing references to movie titles and person names).
    self._defModFunct = defaultModFunct
    self._results = _positive_int(results, 20)
    self._keywordsResults = _positive_int(keywordsResults, 100)
|
||||
|
||||
def _normalize_movieID(self, movieID):
    """Hook to convert a movieID to its canonical form.

    The base implementation returns the ID untouched.
    """
    return movieID
|
||||
|
||||
def _normalize_personID(self, personID):
    """Hook to convert a personID to its canonical form.

    The base implementation returns the ID untouched.
    """
    return personID
|
||||
|
||||
def _normalize_characterID(self, characterID):
    """Hook to convert a characterID to its canonical form.

    The base implementation returns the ID untouched.
    """
    return characterID
|
||||
|
||||
def _normalize_companyID(self, companyID):
    """Hook to convert a companyID to its canonical form.

    The base implementation returns the ID untouched.
    """
    return companyID
|
||||
|
||||
def _get_real_movieID(self, movieID):
    """Hook to resolve title aliases.

    The base implementation returns the ID untouched.
    """
    return movieID
|
||||
|
||||
def _get_real_personID(self, personID):
    """Hook to resolve name aliases.

    The base implementation returns the ID untouched.
    """
    return personID
|
||||
|
||||
def _get_real_characterID(self, characterID):
    """Hook to resolve character name aliases.

    The base implementation returns the ID untouched.
    """
    return characterID
|
||||
|
||||
def _get_real_companyID(self, companyID):
    """Hook to resolve company name aliases.

    The base implementation returns the ID untouched.
    """
    return companyID
|
||||
|
||||
def _get_infoset(self, prefname):
    """Return the info sets handled by the methods named prefname*.

    The names of the methods of the class starting with prefname are
    collected (the prefname + 'infoset' method itself is left out);
    the prefix is stripped and underscores are turned into spaces.
    """
    cls = self.__class__
    skipped = prefname + 'infoset'
    start = len(prefname)
    found = []
    for attr in dir(cls):
        if not attr.startswith(prefname) or attr == skipped:
            continue
        # Only methods count; other class attributes are ignored.
        if isinstance(getattr(cls, attr), MethodType):
            found.append(attr[start:].replace('_', ' '))
    return found
|
||||
|
||||
def get_movie_infoset(self):
    """Return the names of the info sets that can be fetched for movies."""
    prefix = 'get_movie_'
    return self._get_infoset(prefix)
|
||||
|
||||
def get_person_infoset(self):
    """Return the names of the info sets that can be fetched for persons."""
    prefix = 'get_person_'
    return self._get_infoset(prefix)
|
||||
|
||||
def get_character_infoset(self):
    """Return the names of the info sets that can be fetched for
    characters."""
    prefix = 'get_character_'
    return self._get_infoset(prefix)
|
||||
|
||||
def get_company_infoset(self):
    """Return the names of the info sets that can be fetched for
    companies."""
    prefix = 'get_company_'
    return self._get_infoset(prefix)
|
||||
|
||||
def get_movie(self, movieID, info=Movie.Movie.default_info, modFunct=None):
    """Build and return the Movie object identified by movieID.

    *movieID* -- whatever univocally identifies a movie for the access
                 system in use (the imdbID used by the IMDb web server,
                 a file pointer, a line number in a file, an ID in
                 a database, etc.)
    *info* -- the list of sets of information to retrieve.
    *modFunct* -- if specified, the function used by the Movie object
                  when accessing its text fields (like 'plot'); the
                  default function of the access system otherwise.
    """
    real_id = self._get_real_movieID(self._normalize_movieID(movieID))
    result = Movie.Movie(movieID=real_id, accessSystem=self.accessSystem)
    formatter = modFunct or self._defModFunct
    if formatter is not None:
        result.set_mod_funct(formatter)
    # Fetch the requested sets of information.
    self.update(result, info)
    return result
|
||||
|
||||
get_episode = get_movie
|
||||
|
||||
def _search_movie(self, title, results):
    """Return a list of tuples (movieID, {movieData})

    *title* -- the title to search for.
    *results* -- the maximum number of results.

    Always raises NotImplementedError here.
    """
    # XXX: for the real implementation, see the method of the
    #      subclass, somewhere under the imdb.parser package.
    raise NotImplementedError('override this method')
|
||||
|
||||
def search_movie(self, title, results=None, _episodes=False):
    """Return a list of Movie objects for a query for the given title.

    *title* -- the title to search for; byte strings are decoded using
               the module-level encoding guess.
    *results* -- the maximum number of results to return; the default
                 of the access system when None, 20 when it cannot be
                 converted to an integer.
    *_episodes* -- internal flag: search for titles of episodes only.
    """
    if results is None:
        results = self._results
    try:
        results = int(results)
    except (TypeError, ValueError, OverflowError):
        # TypeError included, as in __init__: an argument of the wrong
        # type falls back on the default, too.
        results = 20
    # XXX: I suppose it will be much safer if the user provides
    #      an unicode string... this is just a guess.
    if not isinstance(title, unicode):
        title = unicode(title, encoding, 'replace')
    if not _episodes:
        res = self._search_movie(title, results)
    else:
        res = self._search_episode(title, results)
    # The subclass may return more items than requested: cut the list.
    return [Movie.Movie(movieID=self._get_real_movieID(mi),
            data=md, modFunct=self._defModFunct,
            accessSystem=self.accessSystem) for mi, md in res][:results]
|
||||
|
||||
def _search_episode(self, title, results):
    """Return a list of tuples (movieID, {movieData})

    *title* -- the episode title to search for.
    *results* -- the maximum number of results.

    Always raises NotImplementedError here.
    """
    # XXX: for the real implementation, see the method of the
    #      subclass, somewhere under the imdb.parser package.
    raise NotImplementedError('override this method')
|
||||
|
||||
def search_episode(self, title, results=None):
    """Search only for titles of tv (mini) series' episodes.

    *title* -- the title to search for.
    *results* -- the maximum number of results to return.

    Returns a list of Movie objects; it is search_movie() called with
    the _episodes flag set.
    """
    return self.search_movie(title, _episodes=True, results=results)
|
||||
|
||||
def get_person(self, personID, info=Person.Person.default_info,
                modFunct=None):
    """Build and return the Person object identified by personID.

    *personID* -- whatever univocally identifies a person for the access
                  system in use (the imdbID used by the IMDb web server,
                  a file pointer, a line number in a file, an ID in
                  a database, etc.)
    *info* -- the list of sets of information to retrieve.
    *modFunct* -- if specified, the function used by the Person object
                  when accessing its text fields (like 'mini biography');
                  the default function of the access system otherwise.
    """
    real_id = self._get_real_personID(self._normalize_personID(personID))
    result = Person.Person(personID=real_id, accessSystem=self.accessSystem)
    formatter = modFunct or self._defModFunct
    if formatter is not None:
        result.set_mod_funct(formatter)
    # Fetch the requested sets of information.
    self.update(result, info)
    return result
|
||||
|
||||
def _search_person(self, name, results):
    """Return a list of tuples (personID, {personData})

    *name* -- the name to search for.
    *results* -- the maximum number of results.

    Always raises NotImplementedError here.
    """
    # XXX: for the real implementation, see the method of the
    #      subclass, somewhere under the imdb.parser package.
    raise NotImplementedError('override this method')
|
||||
|
||||
def search_person(self, name, results=None):
    """Return a list of Person objects for a query for the given name.

    *name* -- the name to search for; byte strings are decoded using
              the module-level encoding guess.
    *results* -- the maximum number of results to return; the default
                 of the access system when None, 20 when it cannot be
                 converted to an integer.
    """
    if results is None:
        results = self._results
    try:
        results = int(results)
    except (TypeError, ValueError, OverflowError):
        # TypeError included, as in __init__: an argument of the wrong
        # type falls back on the default, too.
        results = 20
    if not isinstance(name, unicode):
        name = unicode(name, encoding, 'replace')
    res = self._search_person(name, results)
    # The subclass may return more items than requested: cut the list.
    return [Person.Person(personID=self._get_real_personID(pi),
            data=pd, modFunct=self._defModFunct,
            accessSystem=self.accessSystem) for pi, pd in res][:results]
|
||||
|
||||
def get_character(self, characterID, info=Character.Character.default_info,
                    modFunct=None):
    """Build and return the Character object identified by characterID.

    *characterID* -- whatever univocally identifies a character for the
                     access system in use (the imdbID used by the IMDb web
                     server, a file pointer, a line number in a file,
                     an ID in a database, etc.)
    *info* -- the list of sets of information to retrieve.
    *modFunct* -- if specified, the function used by the Character object
                  when accessing its text fields (like 'biography');
                  the default function of the access system otherwise.
    """
    real_id = self._get_real_characterID(
                                self._normalize_characterID(characterID))
    result = Character.Character(characterID=real_id,
                                 accessSystem=self.accessSystem)
    formatter = modFunct or self._defModFunct
    if formatter is not None:
        result.set_mod_funct(formatter)
    # Fetch the requested sets of information.
    self.update(result, info)
    return result
|
||||
|
||||
def _search_character(self, name, results):
    """Return a list of tuples (characterID, {characterData})

    *name* -- the name to search for.
    *results* -- the maximum number of results.

    Always raises NotImplementedError here.
    """
    # XXX: for the real implementation, see the method of the
    #      subclass, somewhere under the imdb.parser package.
    raise NotImplementedError('override this method')
|
||||
|
||||
def search_character(self, name, results=None):
    """Search for characters matching *name*; return Character objects.

    results is the maximum number of objects returned.  When it is None
    the instance default (self._results) is used; when int() rejects it
    with ValueError or OverflowError, 20 is used instead.
    """
    limit = results
    if limit is None:
        limit = self._results
    try:
        limit = int(limit)
    except (ValueError, OverflowError):
        limit = 20
    # The backend search is always handed a unicode string.
    if not isinstance(name, unicode):
        name = unicode(name, encoding, 'replace')
    characters = []
    for char_id, char_data in self._search_character(name, limit):
        characters.append(Character.Character(
            characterID=self._get_real_characterID(char_id),
            data=char_data,
            modFunct=self._defModFunct,
            accessSystem=self.accessSystem))
    return characters[:limit]
|
||||
|
||||
def get_company(self, companyID, info=Company.Company.default_info,
                modFunct=None):
    """Build a Company for *companyID* and fill it with *info*.

    The companyID is something used to univocally identify a company;
    it can be the imdbID used by the IMDb web server, a file pointer,
    a line number in a file, an ID in a database, etc.  It is passed
    through _normalize_companyID() and _get_real_companyID() before
    the object is created.

    info is the list of sets of information to retrieve.

    modFunct, if given, is the function the Company object uses when
    accessing its text fields (none, so far); otherwise
    self._defModFunct is used when it is set.
    """
    real_id = self._get_real_companyID(self._normalize_companyID(companyID))
    company = Company.Company(companyID=real_id,
                              accessSystem=self.accessSystem)
    mod = modFunct or self._defModFunct
    if mod is not None:
        company.set_mod_funct(mod)
    self.update(company, info)
    return company
|
||||
|
||||
def _search_company(self, name, results):
|
||||
"""Return a list of tuples (companyID, {companyData})"""
|
||||
# XXX: for the real implementation, see the method of the
|
||||
# subclass, somewhere under the imdb.parser package.
|
||||
raise NotImplementedError, 'override this method'
|
||||
|
||||
def search_company(self, name, results=None):
    """Search for companies matching *name*; return Company objects.

    results is the maximum number of objects returned.  When it is None
    the instance default (self._results) is used; when int() rejects it
    with ValueError or OverflowError, 20 is used instead.
    """
    limit = results
    if limit is None:
        limit = self._results
    try:
        limit = int(limit)
    except (ValueError, OverflowError):
        limit = 20
    # The backend search is always handed a unicode string.
    if not isinstance(name, unicode):
        name = unicode(name, encoding, 'replace')
    companies = []
    for company_id, company_data in self._search_company(name, limit):
        companies.append(Company.Company(
            companyID=self._get_real_companyID(company_id),
            data=company_data,
            modFunct=self._defModFunct,
            accessSystem=self.accessSystem))
    return companies[:limit]
|
||||
|
||||
def _search_keyword(self, keyword, results):
|
||||
"""Return a list of 'keyword' strings."""
|
||||
# XXX: for the real implementation, see the method of the
|
||||
# subclass, somewhere under the imdb.parser package.
|
||||
raise NotImplementedError, 'override this method'
|
||||
|
||||
def search_keyword(self, keyword, results=None):
    """Search for existing keywords, similar to the given one.

    results is forwarded to the backend as the result limit.  When it
    is None, self._keywordsResults is used; when int() rejects it with
    ValueError or OverflowError, 100 is used instead.
    """
    limit = results
    if limit is None:
        limit = self._keywordsResults
    try:
        limit = int(limit)
    except (ValueError, OverflowError):
        limit = 100
    # The backend search is always handed a unicode string.
    if not isinstance(keyword, unicode):
        keyword = unicode(keyword, encoding, 'replace')
    return self._search_keyword(keyword, limit)
|
||||
|
||||
def _get_keyword(self, keyword, results):
|
||||
"""Return a list of tuples (movieID, {movieData})"""
|
||||
# XXX: for the real implementation, see the method of the
|
||||
# subclass, somewhere under the imdb.parser package.
|
||||
raise NotImplementedError, 'override this method'
|
||||
|
||||
def get_keyword(self, keyword, results=None):
    """Return a list of Movie objects for the given keyword.

    results is the maximum number of objects returned.  When it is
    None, self._keywordsResults is used; when int() rejects it with
    ValueError or OverflowError, 100 is used instead.
    """
    limit = results
    if limit is None:
        limit = self._keywordsResults
    try:
        limit = int(limit)
    except (ValueError, OverflowError):
        limit = 100
    # XXX: I suppose it will be much safer if the user provides
    #      an unicode string... this is just a guess.
    if not isinstance(keyword, unicode):
        keyword = unicode(keyword, encoding, 'replace')
    movies = []
    for movie_id, movie_data in self._get_keyword(keyword, limit):
        movies.append(Movie.Movie(
            movieID=self._get_real_movieID(movie_id),
            data=movie_data,
            modFunct=self._defModFunct,
            accessSystem=self.accessSystem))
    return movies[:limit]
|
||||
|
||||
def _get_top_bottom_movies(self, kind):
|
||||
"""Return the list of the top 250 or bottom 100 movies."""
|
||||
# XXX: for the real implementation, see the method of the
|
||||
# subclass, somewhere under the imdb.parser package.
|
||||
# This method must return a list of (movieID, {movieDict})
|
||||
# tuples. The kind parameter can be 'top' or 'bottom'.
|
||||
raise NotImplementedError, 'override this method'
|
||||
|
||||
def get_top250_movies(self):
    """Return the list of the top 250 movies, as Movie objects."""
    top = []
    for movie_id, movie_data in self._get_top_bottom_movies('top'):
        top.append(Movie.Movie(
            movieID=self._get_real_movieID(movie_id),
            data=movie_data,
            modFunct=self._defModFunct,
            accessSystem=self.accessSystem))
    return top
|
||||
|
||||
def get_bottom100_movies(self):
    """Return the list of the bottom 100 movies, as Movie objects."""
    bottom = []
    for movie_id, movie_data in self._get_top_bottom_movies('bottom'):
        bottom.append(Movie.Movie(
            movieID=self._get_real_movieID(movie_id),
            data=movie_data,
            modFunct=self._defModFunct,
            accessSystem=self.accessSystem))
    return bottom
|
||||
|
||||
def new_movie(self, *arguments, **keywords):
    """Return a Movie object bound to this access system.

    Positional and keyword arguments are forwarded to Movie.Movie().
    A non-unicode 'title' keyword is decoded to unicode first; when no
    'title' keyword is given and at least two positional arguments are,
    the second positional argument is decoded instead.
    """
    # XXX: not really useful...
    if 'title' in keywords:
        if not isinstance(keywords['title'], unicode):
            keywords['title'] = unicode(keywords['title'],
                                        encoding, 'replace')
    elif len(arguments) > 1:
        if not isinstance(arguments[1], unicode):
            # *arguments is a tuple and cannot be assigned to in place
            # (that raised TypeError); rebuild it with the decoded value.
            arguments = (arguments[:1]
                         + (unicode(arguments[1], encoding, 'replace'),)
                         + arguments[2:])
    return Movie.Movie(accessSystem=self.accessSystem,
                       *arguments, **keywords)
|
||||
|
||||
def new_person(self, *arguments, **keywords):
    """Return a Person object bound to this access system.

    Positional and keyword arguments are forwarded to Person.Person().
    A non-unicode 'name' keyword is decoded to unicode first; when no
    'name' keyword is given and at least two positional arguments are,
    the second positional argument is decoded instead.
    """
    # XXX: not really useful...
    if 'name' in keywords:
        if not isinstance(keywords['name'], unicode):
            keywords['name'] = unicode(keywords['name'],
                                       encoding, 'replace')
    elif len(arguments) > 1:
        if not isinstance(arguments[1], unicode):
            # *arguments is a tuple and cannot be assigned to in place
            # (that raised TypeError); rebuild it with the decoded value.
            arguments = (arguments[:1]
                         + (unicode(arguments[1], encoding, 'replace'),)
                         + arguments[2:])
    return Person.Person(accessSystem=self.accessSystem,
                         *arguments, **keywords)
|
||||
|
||||
def new_character(self, *arguments, **keywords):
    """Return a Character object bound to this access system.

    Positional and keyword arguments are forwarded to
    Character.Character().  A non-unicode 'name' keyword is decoded to
    unicode first; when no 'name' keyword is given and at least two
    positional arguments are, the second positional argument is decoded
    instead.
    """
    # XXX: not really useful...
    if 'name' in keywords:
        if not isinstance(keywords['name'], unicode):
            keywords['name'] = unicode(keywords['name'],
                                       encoding, 'replace')
    elif len(arguments) > 1:
        if not isinstance(arguments[1], unicode):
            # *arguments is a tuple and cannot be assigned to in place
            # (that raised TypeError); rebuild it with the decoded value.
            arguments = (arguments[:1]
                         + (unicode(arguments[1], encoding, 'replace'),)
                         + arguments[2:])
    return Character.Character(accessSystem=self.accessSystem,
                               *arguments, **keywords)
|
||||
|
||||
def new_company(self, *arguments, **keywords):
    """Return a Company object bound to this access system.

    Positional and keyword arguments are forwarded to Company.Company().
    A non-unicode 'name' keyword is decoded to unicode first; when no
    'name' keyword is given and at least two positional arguments are,
    the second positional argument is decoded instead.
    """
    # XXX: not really useful...
    if 'name' in keywords:
        if not isinstance(keywords['name'], unicode):
            keywords['name'] = unicode(keywords['name'],
                                       encoding, 'replace')
    elif len(arguments) > 1:
        if not isinstance(arguments[1], unicode):
            # *arguments is a tuple and cannot be assigned to in place
            # (that raised TypeError); rebuild it with the decoded value.
            arguments = (arguments[:1]
                         + (unicode(arguments[1], encoding, 'replace'),)
                         + arguments[2:])
    return Company.Company(accessSystem=self.accessSystem,
                           *arguments, **keywords)
|
||||
|
||||
def update(self, mop, info=None, override=0):
    """Retrieve the requested sets of information and store them in *mop*.

    mop is a Movie, Person, Character or Company object with only
    partial information; any other object raises IMDbError.

    info is one information set name or a list/tuple of names.  None
    selects mop.default_info; the string 'all' selects whatever this
    object's get_<kind>_infoset() method returns for the kind of mop.

    If override is set, the information are retrieved and updated
    even if they're already listed in mop.current_info.

    A Character with a null characterID is left untouched; any other
    object with a null ID raises IMDbDataAccessError.
    """
    # XXX: should this be a method of the Movie/Person/Character/Company
    #      classes?  NO!  What for instances created by external functions?
    kinds = ((Movie.Movie, 'movieID', 'movie'),
             (Person.Person, 'personID', 'person'),
             (Character.Character, 'characterID', 'character'),
             (Company.Company, 'companyID', 'company'))
    for klass, id_attr, prefix in kinds:
        if isinstance(mop, klass):
            mopID = getattr(mop, id_attr)
            break
    else:
        raise IMDbError('object ' + repr(mop) +
                ' is not a Movie, Person, Character or Company instance')
    if mopID is None:
        # XXX: enough?  It's obvious that there are Characters
        #      objects without characterID, so I think they should
        #      just do nothing, when an i.update(character) is tried.
        if prefix == 'character':
            return
        raise IMDbDataAccessError(
                'the supplied object has null movieID, personID or companyID')
    # Fetch through the access system the object was created with.
    if mop.accessSystem == self.accessSystem:
        aSystem = self
    else:
        aSystem = IMDb(mop.accessSystem)
    if info is None:
        info = mop.default_info
    elif info == 'all':
        # prefix was chosen by the same type checks, so this resolves to
        # get_movie_infoset / get_person_infoset / get_character_infoset /
        # get_company_infoset.
        info = getattr(self, 'get_%s_infoset' % prefix)()
    if not isinstance(info, (tuple, list)):
        info = (info,)
    res = {}
    for infoset in info:
        if infoset in mop.current_info and not override:
            continue
        if not infoset:
            continue
        self._imdb_logger.debug('retrieving "%s" info set', infoset)
        try:
            method = getattr(aSystem, 'get_%s_%s' %
                             (prefix, infoset.replace(' ', '_')))
        except AttributeError:
            self._imdb_logger.error('unknown information set "%s"', infoset)
            # Keeps going.
            method = lambda *x: {}
        try:
            ret = method(mopID)
        except Exception:
            # A failing info set is logged and treated as empty, so the
            # remaining sets are still processed.
            self._imdb_logger.critical('caught an exception retrieving ' \
                                       'or parsing "%s" info set for mopID ' \
                                       '"%s" (accessSystem: %s)',
                                       infoset, mopID, mop.accessSystem,
                                       exc_info=True)
            ret = {}
        keys = None
        if 'data' in ret:
            data = ret['data']
            res.update(data)
            if isinstance(data, dict):
                keys = data.keys()
        if 'info sets' in ret:
            for ri in ret['info sets']:
                mop.add_to_current_info(ri, keys, mainInfoset=infoset)
        else:
            mop.add_to_current_info(infoset, keys)
        if 'titlesRefs' in ret:
            mop.update_titlesRefs(ret['titlesRefs'])
        if 'namesRefs' in ret:
            mop.update_namesRefs(ret['namesRefs'])
        if 'charactersRefs' in ret:
            mop.update_charactersRefs(ret['charactersRefs'])
    mop.set_data(res, override=0)
|
||||
|
||||
def get_imdbMovieID(self, movieID):
    """Translate a movieID in an imdbID (the ID used by the IMDb
    web server).

    Must be overridden by the subclass; for the real implementation see
    the method of the subclass, somewhere under the imdb.parser package.
    """
    raise NotImplementedError('override this method')
|
||||
|
||||
def get_imdbPersonID(self, personID):
    """Translate a personID in an imdbID (the ID used by the IMDb
    web server).

    Must be overridden by the subclass; for the real implementation see
    the method of the subclass, somewhere under the imdb.parser package.
    """
    raise NotImplementedError('override this method')
|
||||
|
||||
def get_imdbCharacterID(self, characterID):
    """Translate a characterID in an imdbID (the ID used by the IMDb
    web server).

    Must be overridden by the subclass; for the real implementation see
    the method of the subclass, somewhere under the imdb.parser package.
    """
    raise NotImplementedError('override this method')
|
||||
|
||||
def get_imdbCompanyID(self, companyID):
    """Translate a companyID in an imdbID (the ID used by the IMDb
    web server).

    Must be overridden by the subclass; for the real implementation see
    the method of the subclass, somewhere under the imdb.parser package.
    """
    raise NotImplementedError('override this method')
|
||||
|
||||
def _searchIMDb(self, kind, ton):
|
||||
"""Search the IMDb akas server for the given title or name."""
|
||||
# The Exact Primary search system has gone AWOL, so we resort
|
||||
# to the mobile search. :-/
|
||||
if not ton:
|
||||
return None
|
||||
aSystem = IMDb('mobile')
|
||||
if kind == 'tt':
|
||||
searchFunct = aSystem.search_movie
|
||||
check = 'long imdb canonical title'
|
||||
elif kind == 'nm':
|
||||
searchFunct = aSystem.search_person
|
||||
check = 'long imdb canonical name'
|
||||
elif kind == 'char':
|
||||
searchFunct = aSystem.search_character
|
||||
check = 'long imdb canonical name'
|
||||
elif kind == 'co':
|
||||
# XXX: are [COUNTRY] codes included in the results?
|
||||
searchFunct = aSystem.search_company
|
||||
check = 'long imdb name'
|
||||
try:
|
||||
searchRes = searchFunct(ton)
|
||||
except IMDbError:
|
||||
return None
|
||||
# When only one result is returned, assume it was from an
|
||||
# exact match.
|
||||
if len(searchRes) == 1:
|
||||
return searchRes[0].getID()
|
||||
for item in searchRes:
|
||||
# Return the first perfect match.
|
||||
if item[check] == ton:
|
||||
return item.getID()
|
||||
return None
|
||||
|
||||
def title2imdbID(self, title):
    """Translate a movie title (in the plain text data files format)
    to an imdbID.

    Delegates to _searchIMDb() with kind 'tt'; returns None if it's
    unable to get the imdbID.
    """
    imdbID = self._searchIMDb('tt', title)
    return imdbID
|
||||
|
||||
def name2imdbID(self, name):
    """Translate a person name in an imdbID.

    Delegates to _searchIMDb() with kind 'nm' (the people search);
    returns None if it's unable to get the imdbID.
    """
    # This used to pass 'tt', which runs the *movie* search and compares
    # against movie titles, so a person was never looked up.
    return self._searchIMDb('nm', name)
|
||||
|
||||
def character2imdbID(self, name):
    """Translate a character name in an imdbID.

    Delegates to _searchIMDb() with kind 'char'; returns None if it's
    unable to get the imdbID.
    """
    imdbID = self._searchIMDb('char', name)
    return imdbID
|
||||
|
||||
def company2imdbID(self, name):
    """Translate a company name in an imdbID.

    Delegates to _searchIMDb() with kind 'co'; returns None if it's
    unable to get the imdbID.
    """
    imdbID = self._searchIMDb('co', name)
    return imdbID
|
||||
|
||||
def get_imdbID(self, mop):
    """Return the imdbID for the given Movie, Person, Character or Company
    object.

    When the object carries an ID, it is translated with the matching
    get_imdb*ID() method of the access system the object was created
    with; otherwise the title/name built from the object is searched
    with the matching *2imdbID() method.  Raises IMDbError for any
    other kind of object.
    """
    imdbID = None
    if mop.accessSystem == self.accessSystem:
        aSystem = self
    else:
        aSystem = IMDb(mop.accessSystem)
    if isinstance(mop, Movie.Movie):
        if mop.movieID is not None:
            imdbID = aSystem.get_imdbMovieID(mop.movieID)
        else:
            imdbID = aSystem.title2imdbID(build_title(mop, canonical=0,
                                                      ptdf=1))
    elif isinstance(mop, Person.Person):
        if mop.personID is not None:
            imdbID = aSystem.get_imdbPersonID(mop.personID)
        else:
            imdbID = aSystem.name2imdbID(build_name(mop, canonical=1))
    elif isinstance(mop, Character.Character):
        if mop.characterID is not None:
            imdbID = aSystem.get_imdbCharacterID(mop.characterID)
        else:
            # canonical=0 ?
            imdbID = aSystem.character2imdbID(build_name(mop, canonical=1))
    elif isinstance(mop, Company.Company):
        if mop.companyID is not None:
            imdbID = aSystem.get_imdbCompanyID(mop.companyID)
        else:
            imdbID = aSystem.company2imdbID(build_company_name(mop))
    else:
        # Company is accepted above, so the message names it too (it
        # previously listed only Movie, Person and Character).
        raise IMDbError('object ' + repr(mop) +
                ' is not a Movie, Person, Character or Company instance')
    return imdbID
|
||||
|
||||
def get_imdbURL(self, mop):
    """Return the main IMDb URL for the given Movie, Person,
    Character or Company object, or None if unable to get it.

    The URL is the imdbURL_*_main template for the kind of mop,
    formatted with the value returned by get_imdbID().
    """
    imdbID = self.get_imdbID(mop)
    if imdbID is None:
        return None
    templates = ((Movie.Movie, imdbURL_movie_main),
                 (Person.Person, imdbURL_person_main),
                 (Character.Character, imdbURL_character_main),
                 (Company.Company, imdbURL_company_main))
    for klass, url_firstPart in templates:
        if isinstance(mop, klass):
            return url_firstPart % imdbID
    raise IMDbError('object ' + repr(mop) +
            ' is not a Movie, Person, Character or Company instance')
|
||||
|
||||
def get_special_methods(self):
    """Return the special methods defined by the subclass.

    The result maps method name -> docstring for every method of
    self.__class__ that is not private (leading underscore), is not a
    method of IMDbBase, and is not a get_movie_* / get_person_* /
    get_company_* / get_character_* info-set getter.
    """
    base_methods = [name for name in dir(IMDbBase)
                    if isinstance(getattr(IMDbBase, name), MethodType)]
    skipped_prefixes = ('_', 'get_movie_', 'get_person_',
                        'get_company_', 'get_character_')
    sm_dict = {}
    klass = self.__class__
    for name in dir(klass):
        if name in base_methods:
            continue
        if [prefix for prefix in skipped_prefixes if name.startswith(prefix)]:
            continue
        member = getattr(klass, name)
        if isinstance(member, MethodType):
            sm_dict[name] = member.__doc__
    return sm_dict
|
||||
|
||||
@@ -1,72 +0,0 @@
|
||||
"""
|
||||
_compat module (imdb package).
|
||||
|
||||
This module provides compatibility functions used by the imdb package
|
||||
to deal with unusual environments.
|
||||
|
||||
Copyright 2008-2010 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
# TODO: now we're heavily using the 'logging' module, which was not
|
||||
# present in Python 2.2. To work in a Symbian environment, we
|
||||
# need to create a fake 'logging' module (its functions may call
|
||||
# the 'warnings' module, or do nothing at all).
|
||||
|
||||
|
||||
import os
|
||||
# If true, we're working on a Symbian device.
|
||||
if os.name == 'e32':
    # Symbian device: provide os.path.expandvars and os.path.expanduser
    # when the platform's os.path lacks them.
    def _noact(x):
        """Ad-hoc replacement for IMDbPY."""
        return x
    if not hasattr(os.path, 'expandvars'):
        os.path.expandvars = _noact
    if not hasattr(os.path, 'expanduser'):
        os.path.expanduser = _noact
|
||||
|
||||
# time.strptime is missing, on Symbian devices.
|
||||
import time
|
||||
try:
    time.strptime
except AttributeError:
    # No native time.strptime: install a minimal replacement that only
    # understands the two formats handled below.
    import re
    _re_web_time = re.compile(r'Episode dated (\d+) (\w+) (\d+)')
    _re_ptdf_time = re.compile(r'\((\d+)-(\d+)-(\d+)\)')
    _month2digit = {'January': '1', 'February': '2', 'March': '3',
                    'April': '4', 'May': '5', 'June': '6', 'July': '7',
                    'August': '8', 'September': '9', 'October': '10',
                    'November': '11', 'December': '12'}

    def strptime(s, format):
        """Ad-hoc strptime replacement for IMDbPY.

        A format starting with 'Episode' parses
        'Episode dated <day> <month name> <year>'; any other format
        parses '(<year>-<month>-<day>)'.  Returns a 9-item tuple whose
        first three items are year, month and day; raises ValueError
        when s cannot be parsed.
        """
        try:
            if format.startswith('Episode'):
                day, month, year = _re_web_time.findall(s)[0]
                return (int(year), int(_month2digit[month]), int(day),
                        0, 0, 0, 0, 1, 0)
            else:
                year, month, day = _re_ptdf_time.findall(s)[0]
                return (int(year), int(month), int(day),
                        0, 0, 0, 0, 1, 0)
        except Exception:
            # Was a bare "except:", which also turned KeyboardInterrupt
            # and SystemExit into ValueError.
            raise ValueError(u'error in IMDbPY\'s ad-hoc strptime!')
    time.strptime = strptime
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
"""
|
||||
_exceptions module (imdb package).
|
||||
|
||||
This module provides the exception hierarchy used by the imdb package.
|
||||
|
||||
Copyright 2004-2009 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
|
||||
class IMDbError(Exception, object):
    """Base class for every exception raised by the imdb package."""
    _logger = logging.getLogger('imdbpy')

    def __init__(self, *args, **kwargs):
        """Initialize the exception and pass the message to the log system.

        Positional arguments become the exception's args.  Keyword
        arguments are only logged.
        """
        # Every raised exception also dispatch a critical log.
        self._logger.critical('%s exception raised; args: %s; kwds: %s',
                              self.__class__.__name__, args, kwargs,
                              exc_info=True)
        # Exception.__init__ takes no keyword arguments; forwarding
        # kwargs raised TypeError instead of building this exception.
        super(IMDbError, self).__init__(*args)
|
||||
|
||||
class IMDbDataAccessError(IMDbError):
    """Exception raised when it is not possible to access needed data."""
|
||||
|
||||
class IMDbParserError(IMDbError):
    """Exception raised when an error occurred parsing the data."""
|
||||
|
||||
|
||||
@@ -1,63 +0,0 @@
|
||||
"""
|
||||
_logging module (imdb package).
|
||||
|
||||
This module provides the logging facilities used by the imdb package.
|
||||
|
||||
Copyright 2009-2010 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
# Level names accepted by setLevel(), mapped to logging constants.
LEVELS = {
    'debug': logging.DEBUG,
    'info': logging.INFO,
    'warn': logging.WARNING,
    'warning': logging.WARNING,
    'error': logging.ERROR,
    'critical': logging.CRITICAL,
}

# The package logger, with its own stream handler and record format.
imdbpyLogger = logging.getLogger('imdbpy')
imdbpyStreamHandler = logging.StreamHandler()
imdbpyFormatter = logging.Formatter(
    '%(asctime)s %(levelname)s [%(name)s]'
    ' %(pathname)s:%(lineno)d: %(message)s')
imdbpyStreamHandler.setFormatter(imdbpyFormatter)
imdbpyLogger.addHandler(imdbpyStreamHandler)
|
||||
|
||||
def setLevel(level):
    """Set logging level for the main logger.

    level is a name from LEVELS (case and surrounding whitespace are
    ignored); an unknown name selects logging.NOTSET.  The new
    threshold is then logged at that same level.
    """
    wanted = LEVELS.get(level.lower().strip(), logging.NOTSET)
    imdbpyLogger.setLevel(wanted)
    current = imdbpyLogger.level
    imdbpyLogger.log(current, 'set logging threshold to "%s"',
                     logging.getLevelName(current))
|
||||
|
||||
|
||||
#imdbpyLogger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
# It can be an idea to have a single function to log and warn:
|
||||
#import warnings
|
||||
#def log_and_warn(msg, args=None, logger=None, level=None):
|
||||
# """Log the message and issue a warning."""
|
||||
# if logger is None:
|
||||
# logger = imdbpyLogger
|
||||
# if level is None:
|
||||
# level = logging.WARNING
|
||||
# if args is None:
|
||||
# args = ()
|
||||
# #warnings.warn(msg % args, stacklevel=0)
|
||||
# logger.log(level, msg % args)
|
||||
|
||||
@@ -1,142 +0,0 @@
|
||||
"""
|
||||
articles module (imdb package).
|
||||
|
||||
This module provides functions and data to handle in a smart way
|
||||
articles (in various languages) at the beginning of movie titles.
|
||||
|
||||
Copyright 2009 Davide Alberani <da@erlug.linux.it>
|
||||
2009 H. Turgut Uyar <uyar@tekir.org>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
# List of generic articles used when the language of the title is unknown (or
|
||||
# we don't have information about articles in that language).
|
||||
# XXX: Managing titles in a lot of different languages, a function to recognize
|
||||
# an initial article can't be perfect; sometimes we'll stumble upon a short
|
||||
# word that is an article in some language, but it's not in another; in these
|
||||
# situations we have to choose if we want to interpret this little word
|
||||
# as an article or not (remember that we don't know what the original language
|
||||
# of the title was).
|
||||
# Example: 'en' is (I suppose) an article in Some Language. Unfortunately it
|
||||
# seems also to be a preposition in other languages (French?).
|
||||
# Running a script over the whole list of titles (and aliases), I've found
|
||||
# that 'en' is used as an article only 376 times, and as another thing 594
|
||||
# times, so I've decided to _always_ consider 'en' as a non article.
|
||||
#
|
||||
# Here is a list of words that are _never_ considered as articles, complete
|
||||
# with the count of times they are used one way or another:
|
||||
# 'en' (376 vs 594), 'to' (399 vs 727), 'as' (198 vs 276), 'et' (79 vs 99),
|
||||
# 'des' (75 vs 150), 'al' (78 vs 304), 'ye' (14 vs 70),
|
||||
# 'da' (23 vs 298), "'n" (8 vs 12)
|
||||
#
|
||||
# I've left in the list 'i' (1939 vs 2151) and 'uno' (52 vs 56)
|
||||
# I'm not sure what '-al' is, and so I've left it out...
|
||||
#
|
||||
# Generic list of articles in utf-8 encoding:
|
||||
GENERIC_ARTICLES = (
    'the', 'la', 'a', 'die', 'der', 'le', 'el',
    "l'", 'il', 'das', 'les', 'i', 'o', 'ein', 'un', 'de', 'los',
    'an', 'una', 'las', 'eine', 'den', 'het', 'gli', 'lo', 'os',
    'ang', 'oi', 'az', 'een', 'ha-', 'det', 'ta', 'al-',
    'mga', "un'", 'uno', 'ett', 'dem', 'egy', 'els', 'eines',
    # Non-ascii articles, written as utf-8 byte escapes.
    '\xc3\x8f', '\xc3\x87', '\xc3\x94\xc3\xaf', '\xc3\x8f\xc3\xa9',
)
|
||||
|
||||
|
||||
# Lists of articles separated by language. If possible, the list should
|
||||
# be sorted by frequency (not very important, but...)
|
||||
# If you want to add a list of articles for another language, mail it
|
||||
# to imdbpy-devel@lists.sourceforge.net; non-ascii articles must be utf-8
|
||||
# encoded.
|
||||
LANG_ARTICLES = {
    'English':
        ('the', 'a', 'an'),
    'Italian':
        ('la', 'le', "l'", 'il', 'i', 'un', 'una', 'gli', 'lo', "un'",
         'uno'),
    'Spanish':
        ('la', 'le', 'el', 'les', 'un', 'los', 'una', 'uno', 'unos',
         'unas'),
    'Portuguese':
        ('a', 'as', 'o', 'os', 'um', 'uns', 'uma', 'umas'),
    # Some languages don't have articles.
    'Turkish':
        (),
}
# Bound method kept under its own name for the per-language lookups.
LANG_ARTICLESget = LANG_ARTICLES.get
|
||||
|
||||
|
||||
# Maps a language to countries where it is the main language.
|
||||
# If you want to add an entry for another language or country, mail it at
|
||||
# imdbpy-devel@lists.sourceforge.net .
|
||||
_LANG_COUNTRIES = {
|
||||
'English': ('USA', 'UK', 'Canada', 'Ireland', 'Australia'),
|
||||
'Italian': ('Italy',),
|
||||
'Spanish': ('Spain', 'Mexico'),
|
||||
'Portuguese': ('Portugal', 'Brazil'),
|
||||
'Turkish': ('Turkey',),
|
||||
#'German': ('Germany', 'East Germany', 'West Germany'),
|
||||
#'French': ('France'),
|
||||
}
|
||||
|
||||
# Maps countries to their main language.
|
||||
COUNTRY_LANG = {}
|
||||
for lang in _LANG_COUNTRIES:
|
||||
for country in _LANG_COUNTRIES[lang]:
|
||||
COUNTRY_LANG[country] = lang
|
||||
|
||||
|
||||
def toUnicode(articles):
    """Decode each utf-8 encoded article; return the results as a tuple."""
    decoded = []
    for article in articles:
        decoded.append(article.decode('utf_8'))
    return tuple(decoded)
|
||||
|
||||
|
||||
def toDicts(articles):
    """Build two identity dictionaries from utf-8 encoded articles.

    The first maps every encoded article to itself, the second does
    the same for the utf-8 decoded (unicode) articles; both are meant
    for faster matches.
    """
    encoded = {}
    decoded = {}
    uArticles = tuple([art.decode('utf_8') for art in articles])
    for article in articles:
        encoded[article] = article
    for uArticle in uArticles:
        decoded[uArticle] = uArticle
    return encoded, decoded
|
||||
|
||||
|
||||
def addTrailingSpace(articles):
    """Return the articles, each followed by a space, in two lists.

    The first list holds the utf-8 encoded strings, the second their
    unicode decodings.  No space is appended to an article whose last
    char is ' or -.
    """
    _spArticles = []
    _spUnicodeArticles = []
    for article in articles:
        lastChar = article[-1]
        if not (lastChar == "'" or lastChar == '-'):
            article = article + ' '
        _spArticles.append(article)
        _spUnicodeArticles.append(article.decode('utf_8'))
    return _spArticles, _spUnicodeArticles
|
||||
|
||||
|
||||
# Caches.
|
||||
_ART_CACHE = {}
|
||||
_SP_ART_CACHE = {}
|
||||
|
||||
def articlesDictsForLang(lang):
    """Return dictionaries of articles specific for the given language, or
    the default ones if the language is not known.

    The result of toDicts() is cached in _ART_CACHE per language.
    """
    try:
        return _ART_CACHE[lang]
    except KeyError:
        pass
    articles = LANG_ARTICLESget(lang, GENERIC_ARTICLES)
    _ART_CACHE[lang] = artDicts = toDicts(articles)
    return artDicts
|
||||
|
||||
|
||||
def spArticlesForLang(lang):
    """Return lists of articles (plus optional spaces) specific for the
    given language, or the default ones if the language is not known.

    The result of addTrailingSpace() is cached in _SP_ART_CACHE per
    language.
    """
    try:
        return _SP_ART_CACHE[lang]
    except KeyError:
        pass
    articles = LANG_ARTICLESget(lang, GENERIC_ARTICLES)
    _SP_ART_CACHE[lang] = spArticles = addTrailingSpace(articles)
    return spArticles
|
||||
|
||||
@@ -1,548 +0,0 @@
|
||||
"""
|
||||
helpers module (imdb package).
|
||||
|
||||
This module provides functions not used directly by the imdb package,
|
||||
but useful for IMDbPY-based programs.
|
||||
|
||||
Copyright 2006-2010 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
# XXX: find better names for the functions in this modules.
|
||||
|
||||
import re
|
||||
from cgi import escape
|
||||
import gettext
|
||||
from gettext import gettext as _
|
||||
gettext.textdomain('imdbpy')
|
||||
|
||||
# The modClearRefs can be used to strip names and titles references from
|
||||
# the strings in Movie and Person objects.
|
||||
from imdb.utils import modClearRefs, re_titleRef, re_nameRef, \
|
||||
re_characterRef, _tagAttr, _Container, TAGS_TO_MODIFY
|
||||
from imdb import IMDb, imdbURL_movie_base, imdbURL_person_base, \
|
||||
imdbURL_character_base
|
||||
import imdb.locale
|
||||
from imdb.Movie import Movie
|
||||
from imdb.Person import Person
|
||||
from imdb.Character import Character
|
||||
from imdb.Company import Company
|
||||
from imdb.parser.http.utils import re_entcharrefssub, entcharrefs, \
|
||||
subXMLRefs, subSGMLRefs
|
||||
from imdb.parser.http.bsouplxml.etree import BeautifulSoup
|
||||
|
||||
|
||||
# An URL, more or less.
|
||||
_re_href = re.compile(r'(http://.+?)(?=\s|$)', re.I)
|
||||
_re_hrefsub = _re_href.sub
|
||||
|
||||
|
||||
def makeCgiPrintEncoding(encoding):
    """Make a function to pretty-print strings for the web.

    The returned function HTML-escapes its argument (quotes included)
    and, when the escaped value is a unicode string, encodes it with
    *encoding*, replacing the characters outside that charset with XML
    character references.
    """
    def cgiPrint(s):
        s = escape(s, quote=1)
        if isinstance(s, unicode):
            s = s.encode(encoding, 'xmlcharrefreplace')
        return s
    # A '%'-formatted string literal is a plain expression, not a
    # docstring: set the documentation explicitly, once, instead of
    # re-evaluating (and discarding) it at every call.
    cgiPrint.__doc__ = ('Encode the given string using the %s encoding, '
                        'and replace\nchars outside the given charset with '
                        'XML char references.' % (encoding,))
    return cgiPrint

# cgiPrint uses the latin_1 encoding.
cgiPrint = makeCgiPrintEncoding('latin_1')
|
||||
|
||||
# Matches %(varname)s substitutions; group 1 is the variable name.
re_subst = re.compile(r'%\((.+?)\)s')
# Matches <if condition>....</if condition> clauses; group 1 is the
# condition and group 2 the text enclosed by the clause.
re_conditional = re.compile(r'<if\s+(.+?)\s*>(.+?)</if\s+\1\s*>')
|
||||
|
||||
|
||||
def makeTextNotes(replaceTxtNotes):
    """Create a function useful to handle text[::optional_note] values.

    replaceTxtNotes is a format string, which can include the following
    values: %(text)s and %(notes)s.
    Portions of the text can be conditionally excluded, if one of the
    values is absent. E.g.: <if notes>[%(notes)s]</if notes> will be
    replaced with '[notes]' if notes exists, or by an empty string
    otherwise.
    The returned function is suitable to be passed as the applyToValues
    argument of the makeObject2Txt function.
    """
    def _replacer(s):
        # Anything that is not a string is handed back untouched.
        if not isinstance(s, (unicode, str)):
            return s
        text, separator, notes = s.partition('::')
        # Names of the values that are present, consulted by the
        # conditional clauses; a '::' separator counts as "notes
        # present" even when nothing follows it.
        present = set()
        if text:
            present.add('text')
        if separator:
            present.add('notes')
        else:
            notes = u''
        outS = replaceTxtNotes.replace('%(text)s', text)
        outS = outS.replace('%(notes)s', notes)

        def _excludeFalseConditionals(matchobj):
            # Return an empty string if the conditional is false/empty.
            if matchobj.group(1) in present:
                return matchobj.group(2)
            return u''

        # Repeat until no clause is left: resolving a clause can expose
        # another one.
        while re_conditional.search(outS):
            outS = re_conditional.sub(_excludeFalseConditionals, outS)
        return outS
    return _replacer
|
||||
|
||||
|
||||
def makeObject2Txt(movieTxt=None, personTxt=None, characterTxt=None,
                   companyTxt=None, joiner=' / ',
                   applyToValues=lambda x: x, _recurse=True):
    """Return a function useful to pretty-print Movie, Person,
    Character and Company instances.

    *movieTxt* -- how to format a Movie object.
    *personTxt* -- how to format a Person object.
    *characterTxt* -- how to format a Character object.
    *companyTxt* -- how to format a Company object.
    *joiner* -- string used to join a list of objects.
    *applyToValues* -- function to apply to (string) values.
    *_recurse* -- if True (default) non-string values are formatted by
                  calling the returned function on them; if False they
                  are converted with unicode() instead.

    The format strings may contain %(key)s substitutions and
    <if key>...</if key> conditional clauses.
    """
    # Some useful defaults.
    if movieTxt is None:
        movieTxt = '%(long imdb title)s'
    if personTxt is None:
        personTxt = '%(long imdb name)s'
    if characterTxt is None:
        characterTxt = '%(long imdb name)s'
    if companyTxt is None:
        companyTxt = '%(long imdb name)s'
    # Checked in this order with isinstance(); the first match wins.
    templates = ((Movie, movieTxt), (Person, personTxt),
                 (Character, characterTxt), (Company, companyTxt))

    def object2txt(obj, _limitRecursion=None):
        """Pretty-print objects."""
        # Prevent unlimited recursion.
        if _limitRecursion is None:
            _limitRecursion = 0
        elif _limitRecursion > 5:
            return u''
        _limitRecursion += 1
        if isinstance(obj, (list, tuple)):
            return joiner.join([object2txt(o, _limitRecursion=_limitRecursion)
                                for o in obj])
        elif isinstance(obj, dict):
            # XXX: not exactly nice, neither useful, I fear.
            return joiner.join([u'%s::%s' %
                            (object2txt(k, _limitRecursion=_limitRecursion),
                             object2txt(v, _limitRecursion=_limitRecursion))
                            for k, v in obj.items()])
        for objClass, template in templates:
            if isinstance(obj, objClass):
                outs = template
                break
        else:
            # Not one of the supported classes: returned as it is.
            return obj

        def _excludeFalseConditionals(matchobj):
            # Return an empty string if the conditional is false/empty.
            condition = matchobj.group(1)
            proceed = obj.get(condition) or getattr(obj, condition, None)
            if proceed:
                return matchobj.group(2)
            return u''

        while re_conditional.search(outs):
            outs = re_conditional.sub(_excludeFalseConditionals, outs)
        for key in re_subst.findall(outs):
            # Look the key up among the data first, then as an attribute.
            value = obj.get(key) or getattr(obj, key, None)
            if not isinstance(value, (unicode, str)):
                if not _recurse:
                    if value:
                        value = unicode(value)
                if value:
                    value = object2txt(value, _limitRecursion=_limitRecursion)
            elif value:
                value = applyToValues(unicode(value))
            if not value:
                value = u''
            elif not isinstance(value, (unicode, str)):
                value = unicode(value)
            outs = outs.replace(u'%(' + key + u')s', value)
        return outs
    return object2txt
|
||||
|
||||
|
||||
def makeModCGILinks(movieTxt, personTxt, characterTxt=None,
                    encoding='latin_1'):
    """Make a function used to pretty-print movies and persons references;
    movieTxt and personTxt are the strings used for the substitutions.
    movieTxt must contain %(movieID)s and %(title)s, while personTxt
    must contain %(personID)s and %(name)s and characterTxt %(characterID)s
    and %(name)s; characterTxt is optional, for backward compatibility.

    The returned function also exposes the three format strings as its
    movieTxt, personTxt and characterTxt attributes.
    """
    _cgiPrint = makeCgiPrintEncoding(encoding)

    def _label(text):
        # Text shown for a reference: escaped and encoded by _cgiPrint,
        # then converted back to unicode.
        return unicode(_cgiPrint(text), encoding, 'xmlcharrefreplace')

    def modCGILinks(s, titlesRefs, namesRefs, characterRefs=None):
        """Substitute movies and persons references."""
        if characterRefs is None:
            characterRefs = {}

        # XXX: look ma'... more nested scopes! <g>
        def _replaceMovie(match):
            to_replace = match.group(1)
            item = titlesRefs.get(to_replace)
            if item:
                to_replace = movieTxt % {'movieID': item.movieID,
                                         'title': _label(to_replace)}
            return to_replace

        def _replacePerson(match):
            to_replace = match.group(1)
            item = namesRefs.get(to_replace)
            if item:
                to_replace = personTxt % {'personID': item.personID,
                                          'name': _label(to_replace)}
            return to_replace

        def _replaceCharacter(match):
            to_replace = match.group(1)
            if characterTxt is None:
                return to_replace
            item = characterRefs.get(to_replace)
            if item:
                characterID = item.characterID
                # Without an ID there is nothing to link to.
                if characterID is None:
                    return to_replace
                to_replace = characterTxt % {'characterID': characterID,
                                             'name': _label(to_replace)}
            return to_replace

        # Neutralize markup already present in the text, before adding
        # the generated links.
        s = s.replace('<', '&lt;').replace('>', '&gt;')
        s = _re_hrefsub(r'<a href="\1">\1</a>', s)
        s = re_titleRef.sub(_replaceMovie, s)
        s = re_nameRef.sub(_replacePerson, s)
        s = re_characterRef.sub(_replaceCharacter, s)
        return s
    modCGILinks.movieTxt = movieTxt
    modCGILinks.personTxt = personTxt
    modCGILinks.characterTxt = characterTxt
    return modCGILinks
|
||||
|
||||
# Link templates pointing to the imdb.com web site.
_movieTxt = ''.join(('<a href="', imdbURL_movie_base,
                     'tt%(movieID)s">%(title)s</a>'))
_personTxt = ''.join(('<a href="', imdbURL_person_base,
                      'nm%(personID)s">%(name)s</a>'))
_characterTxt = ''.join(('<a href="', imdbURL_character_base,
                         'ch%(characterID)s">%(name)s</a>'))
# Ready-made substitution functions: the first relies on the default
# encoding of makeModCGILinks, the second one asks for 'ascii'.
modHtmlLinks = makeModCGILinks(movieTxt=_movieTxt,
                               personTxt=_personTxt,
                               characterTxt=_characterTxt)
modHtmlLinksASCII = makeModCGILinks(movieTxt=_movieTxt,
                                    personTxt=_personTxt,
                                    characterTxt=_characterTxt,
                                    encoding='ascii')
|
||||
|
||||
|
||||
# Copy of entcharrefs extended with the five entities listed here, each
# one registered both by name and by '#<code point>'.
everyentcharrefs = entcharrefs.copy()
for k, v in {'lt':u'<','gt':u'>','amp':u'&','quot':u'"','apos':u'\''}.items():
    everyentcharrefs.update({k: v, '#%s' % ord(v): v})
everyentcharrefsget = everyentcharrefs.get
# Matches '&<ref>;' where <ref> is one of the known references or a
# numeric one made of '#' and up to five digits.
re_everyentcharrefs = re.compile(
    r'&(%s|\#160|\#\d{1,5});' %
    '|'.join(re.escape(name) for name in everyentcharrefs))
re_everyentcharrefssub = re_everyentcharrefs.sub
|
||||
|
||||
def _replAllXMLRef(match):
    """Replace the matched XML reference.

    Group 1 of *match* is looked up in everyentcharrefs; an unknown
    reference starting with '#' is converted with unichr(), any other
    unknown reference is returned as it is.
    """
    ref = match.group(1)
    value = everyentcharrefsget(ref)
    if value is not None:
        return value
    if ref[0] == '#':
        return unichr(int(ref[1:]))
    return ref
|
||||
|
||||
def subXMLHTMLSGMLRefs(s):
    """Return *s* with XML/HTML/SGML entity and char references replaced.

    Every match of re_everyentcharrefs is substituted with the value
    computed by _replAllXMLRef.
    """
    replaced = re_everyentcharrefssub(_replAllXMLRef, s)
    return replaced
|
||||
|
||||
|
||||
def sortedSeasons(m):
    """Return a sorted list of seasons of the given series.

    The seasons are the keys of m's 'episodes' mapping; an empty list is
    returned when there is no such mapping.
    """
    # sorted() always builds a new list, so this does not depend on
    # keys() returning a sortable list.
    return sorted(m.get('episodes', {}).keys())
|
||||
|
||||
|
||||
def sortedEpisodes(m, season=None):
    """Return a sorted list of episodes of the given series,
    considering only the specified season(s) (every season, if None).

    *season* can be a single season or a list/tuple of seasons; the
    seasons are visited in the given order, and the episodes of each
    one are sorted by their key.  Unknown seasons contribute nothing.
    """
    if season is None:
        seasons = sortedSeasons(m)
    elif isinstance(season, (tuple, list)):
        seasons = season
    else:
        seasons = [season]
    episodes = []
    for s in seasons:
        seasonEpisodes = m.get('episodes', {}).get(s, {})
        # sorted() works whether keys() is a list or a view.
        for e in sorted(seasonEpisodes.keys()):
            episodes.append(seasonEpisodes[e])
    return episodes
|
||||
|
||||
|
||||
# Idea and portions of the code courtesy of none none (dclist at gmail.com)
|
||||
_re_imdbIDurl = re.compile(r'\b(nm|tt|ch|co)([0-9]{7})\b')
|
||||
def get_byURL(url, info=None, args=None, kwds=None):
|
||||
"""Return a Movie, Person, Character or Company object for the given URL;
|
||||
info is the info set to retrieve, args and kwds are respectively a list
|
||||
and a dictionary or arguments to initialize the data access system.
|
||||
Returns None if unable to correctly parse the url; can raise
|
||||
exceptions if unable to retrieve the data."""
|
||||
if args is None: args = []
|
||||
if kwds is None: kwds = {}
|
||||
ia = IMDb(*args, **kwds)
|
||||
match = _re_imdbIDurl.search(url)
|
||||
if not match:
|
||||
return None
|
||||
imdbtype = match.group(1)
|
||||
imdbID = match.group(2)
|
||||
if imdbtype == 'tt':
|
||||
return ia.get_movie(imdbID, info=info)
|
||||
elif imdbtype == 'nm':
|
||||
return ia.get_person(imdbID, info=info)
|
||||
elif imdbtype == 'ch':
|
||||
return ia.get_character(imdbID, info=info)
|
||||
elif imdbtype == 'co':
|
||||
return ia.get_company(imdbID, info=info)
|
||||
return None
|
||||
|
||||
|
||||
# Idea and portions of code courtesy of Basil Shubin.
# Beware that this information is now available directly from
# the Movie/Person/Character instances.
def fullSizeCoverURL(obj):
    """Given a URL string or a Movie, Person or Character instance,
    return a URL to the full-size version of the cover/headshot,
    or None otherwise.  This function is obsolete: the same information
    is available under the keys 'full-size cover url' and
    'full-size headshot', respectively for movies and persons/characters."""
    if isinstance(obj, Movie):
        url = obj.get('cover url')
    elif isinstance(obj, (Person, Character)):
        url = obj.get('headshot')
    else:
        # Anything else is taken to be the URL itself.
        url = obj
    if url:
        return _Container._re_fullsizeURL.sub('', url)
    return None
|
||||
|
||||
|
||||
def keyToXML(key):
    """Return a key (the ones used to access information in Movie and
    other classes instances) converted to the style of the XML output.

    This is the first item of the value returned by _tagAttr(key, '').
    """
    xmlName = _tagAttr(key, '')[0]
    return xmlName
|
||||
|
||||
|
||||
def translateKey(key):
    """Translate a given key.

    The key is converted with keyToXML and the result is passed to the
    gettext function bound to '_'.
    """
    xmlName = keyToXML(key)
    return _(xmlName)
|
||||
|
||||
|
||||
# Maps tags to classes.
_MAP_TOP_OBJ = dict(person=Person, movie=Movie,
                    character=Character, company=Company)

# Tags to be converted to lists: the first item of every value of
# TAGS_TO_MODIFY (mapped to None), plus the tags of _MAP_TOP_OBJ.
_TAGS_TO_LIST = dict.fromkeys([x[0] for x in TAGS_TO_MODIFY.values()])
_TAGS_TO_LIST.update(_MAP_TOP_OBJ)
|
||||
|
||||
def tagToKey(tag):
    """Return the name of the tag, taking it from the 'key' attribute,
    if present.

    A key whose 'keytype' attribute is 'int' is converted to an integer;
    without a (non-empty) 'key' attribute, tag.name is returned.
    """
    keyAttr = tag.get('key')
    if not keyAttr:
        return tag.name
    if tag.get('keytype') == 'int':
        return int(keyAttr)
    return keyAttr
|
||||
|
||||
|
||||
def _valueWithType(tag, tagValue):
|
||||
"""Return tagValue, handling some type conversions."""
|
||||
tagType = tag.get('type')
|
||||
if tagType == 'int':
|
||||
tagValue = int(tagValue)
|
||||
elif tagType == 'float':
|
||||
tagValue = float(tagValue)
|
||||
return tagValue
|
||||
|
||||
|
||||
# Extra tags to get (if values were not already read from title/name).
|
||||
_titleTags = ('imdbindex', 'kind', 'year')
|
||||
_nameTags = ('imdbindex')
|
||||
_companyTags = ('imdbindex', 'country')
|
||||
|
||||
def parseTags(tag, _topLevel=True, _as=None, _infoset2keys=None,
              _key2infoset=None):
    """Recursively parse a tree of tags.

    *tag* -- the tag to convert.
    *_topLevel* -- True only for the outermost call; the info sets are
                   attached to the object built by that call.
    *_as* -- access system inherited by the objects that do not carry
             an 'access-system' attribute of their own.
    *_infoset2keys*, *_key2infoset* -- dictionaries shared by the whole
             recursion, collecting the 'infoset' attributes met.

    Returns the converted object: usually an instance of one of the
    classes in _MAP_TOP_OBJ, but it can be a string, an int, a float,
    a list or a dictionary.
    """
    item = None
    if _infoset2keys is None:
        _infoset2keys = {}
    if _key2infoset is None:
        _key2infoset = {}
    name = tagToKey(tag)
    firstChild = tag.find(recursive=False)
    tagStr = (tag.string or u'').strip()
    # The contents are checked so that an empty 'item' tag is handled
    # like any other empty tag, instead of failing on contents[0].
    if not tagStr and name == 'item' and tag.contents:
        # Handles 'item' tags containing text and a 'notes' sub-tag.
        tagContent = tag.contents[0]
        if isinstance(tagContent, BeautifulSoup.NavigableString):
            tagStr = (unicode(tagContent) or u'').strip()
    infoset = tag.get('infoset')
    if infoset:
        _key2infoset[name] = infoset
        _infoset2keys.setdefault(infoset, []).append(name)
    # Here we use tag.name to avoid tags like <item title="company">
    if tag.name in _MAP_TOP_OBJ:
        # One of the subclasses of _Container.
        item = _MAP_TOP_OBJ[name]()
        itemAs = tag.get('access-system')
        if itemAs:
            # The first access system met is inherited by nested objects.
            if not _as:
                _as = itemAs
        else:
            itemAs = _as
        item.accessSystem = itemAs
        tagsToGet = []
        theID = tag.get('id')
        if name == 'movie':
            item.movieID = theID
            tagsToGet = _titleTags
            # NOTE(review): tag.title looks like it is not restricted to
            # the direct children of the tag -- confirm against the
            # BeautifulSoup version in use.
            if tag.title:
                item.set_title(tag.title.string)
                tag.title.extract()
        else:
            theName = None
            if name == 'person':
                item.personID = theID
                tagsToGet = _nameTags
                theName = tag.find('long imdb canonical name',
                                   recursive=False)
                if not theName:
                    theName = tag.find('name', recursive=False)
            elif name == 'character':
                item.characterID = theID
                tagsToGet = _nameTags
                theName = tag.find('name', recursive=False)
            elif name == 'company':
                item.companyID = theID
                tagsToGet = _companyTags
                theName = tag.find('name', recursive=False)
            if theName:
                item.set_name(theName.string)
                theName.extract()
        for t in tagsToGet:
            # Already set (e.g. while handling the title/name).
            if t in item.data:
                continue
            dataTag = tag.find(t, recursive=False)
            if dataTag:
                item.data[tagToKey(dataTag)] = _valueWithType(dataTag,
                                                              dataTag.string)
        if tag.notes:
            item.notes = tag.notes.string
            tag.notes.extract()
        episodeOf = tag.find('episode-of', recursive=False)
        if episodeOf:
            item.data['episode of'] = parseTags(episodeOf, _topLevel=False,
                                        _as=_as, _infoset2keys=_infoset2keys,
                                        _key2infoset=_key2infoset)
            episodeOf.extract()
        cRole = tag.find('current-role', recursive=False)
        if cRole:
            cr = parseTags(cRole, _topLevel=False, _as=_as,
                        _infoset2keys=_infoset2keys, _key2infoset=_key2infoset)
            item.currentRole = cr
            cRole.extract()
        # XXX: big assumption, here.  What about Movie instances used
        #      as keys in dictionaries?  What about other keys (season and
        #      episode number, for example?)
        if not _topLevel:
            return item
        _adder = lambda key, value: item.data.update({key: value})
    elif tagStr:
        if tag.notes:
            notes = (tag.notes.string or u'').strip()
            if notes:
                tagStr += u'::%s' % notes
        else:
            tagStr = _valueWithType(tag, tagStr)
        return tagStr
    elif firstChild:
        firstChildName = tagToKey(firstChild)
        if firstChildName in _TAGS_TO_LIST:
            item = []
            _adder = lambda key, value: item.append(value)
        else:
            item = {}
            _adder = lambda key, value: item.update({key: value})
    else:
        item = {}
        _adder = lambda key, value: item.update({name: value})
    for subTag in tag(recursive=False):
        subTagKey = tagToKey(subTag)
        # Exclude dynamically generated keys.
        if tag.name in _MAP_TOP_OBJ and subTagKey in item._additional_keys():
            continue
        subItem = parseTags(subTag, _topLevel=False, _as=_as,
                        _infoset2keys=_infoset2keys, _key2infoset=_key2infoset)
        if subItem:
            _adder(subTagKey, subItem)
    if _topLevel and name in _MAP_TOP_OBJ:
        # Add information about 'info sets', but only to the top-level object.
        item.infoset2keys = _infoset2keys
        item.key2infoset = _key2infoset
        item.current_info = _infoset2keys.keys()
    return item
|
||||
|
||||
|
||||
def parseXML(xml):
    """Parse an XML string, returning an appropriate object (usually an
    instance of a subclass of _Container).

    None is returned when the parsed document is empty or has no tag to
    start from.
    """
    stoneSoup = BeautifulSoup.BeautifulStoneSoup
    xmlObj = stoneSoup(xml, convertEntities=stoneSoup.XHTML_ENTITIES)
    if not xmlObj:
        return None
    mainTag = xmlObj.find()
    if not mainTag:
        return None
    return parseTags(mainTag)
|
||||
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
"""
|
||||
locale package (imdb package).
|
||||
|
||||
This package provides scripts and files for internationalization
|
||||
of IMDbPY.
|
||||
|
||||
Copyright 2009 H. Turgut Uyar <uyar@tekir.org>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import gettext
|
||||
import os
|
||||
|
||||
# Directory of this file, registered as the place where gettext looks
# for the catalogs of the 'imdbpy' domain.
LOCALE_DIR = os.path.split(__file__)[0]

gettext.bindtextdomain('imdbpy', LOCALE_DIR)
|
||||
@@ -1,78 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
generatepot.py script.
|
||||
|
||||
This script generates the imdbpy.pot file, from the DTD.
|
||||
|
||||
Copyright 2009 H. Turgut Uyar <uyar@tekir.org>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
from datetime import datetime as dt
|
||||
|
||||
# Optional "default" text for an element, used instead of the text
# derived from the element name.
DEFAULT_MESSAGES = { }

# Captures the name of every element declared in the DTD.
ELEMENT_PATTERN = r"""<!ELEMENT\s+([^\s]+)"""
re_element = re.compile(ELEMENT_PATTERN)

POT_HEADER_TEMPLATE = r"""# Gettext message file for imdbpy
msgid ""
msgstr ""
"Project-Id-Version: imdbpy\n"
"POT-Creation-Date: %(now)s\n"
"PO-Revision-Date: YYYY-MM-DD HH:MM+0000\n"
"Last-Translator: YOUR NAME <YOUR@EMAIL>\n"
"Language-Team: TEAM NAME <TEAM@EMAIL>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Plural-Forms: nplurals=1; plural=0;\n"
"Language-Code: en\n"
"Language-Name: English\n"
"Preferred-Encodings: utf-8\n"
"Domain: imdbpy\n"
"""

if len(sys.argv) != 2:
    # The usage message goes to stderr (stdout carries the generated
    # catalog) and the exit status reports the failure.
    sys.stderr.write("Usage: %s dtd_file\n" % sys.argv[0])
    sys.exit(2)

dtdfilename = sys.argv[1]
dtdfile = open(dtdfilename)
try:
    dtd = dtdfile.read()
finally:
    # Do not leave the closing of the file to the garbage collector.
    dtdfile.close()
# Every element is emitted once, in alphabetical order.
elements = sorted(set(re_element.findall(dtd)))

print(POT_HEADER_TEMPLATE % {
    'now': dt.strftime(dt.now(), "%Y-%m-%d %H:%M+0000")
})
for element in elements:
    if element in DEFAULT_MESSAGES:
        print('# Default: %s' % DEFAULT_MESSAGES[element])
    else:
        print('# Default: %s' % element.replace('-', ' ').capitalize())
    print('msgid "%s"' % element)
    print('msgstr ""')
    # use this part instead of the line above to generate the po file for English
    #if element in DEFAULT_MESSAGES:
    #    print 'msgstr "%s"' % DEFAULT_MESSAGES[element]
    #else:
    #    print 'msgstr "%s"' % element.replace('-', ' ').capitalize()
    print("")
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,204 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: iso-8859-1 -*-
|
||||
"""Generate binary message catalog from textual translation description.
|
||||
|
||||
This program converts a textual Uniforum-style message catalog (.po file) into
|
||||
a binary GNU catalog (.mo file). This is essentially the same function as the
|
||||
GNU msgfmt program, however, it is a simpler implementation.
|
||||
|
||||
Usage: msgfmt.py [OPTIONS] filename.po
|
||||
|
||||
Options:
|
||||
-o file
|
||||
--output-file=file
|
||||
Specify the output file to write to. If omitted, output will go to a
|
||||
file named filename.mo (based off the input file name).
|
||||
|
||||
-h
|
||||
--help
|
||||
Print this message and exit.
|
||||
|
||||
-V
|
||||
--version
|
||||
Display version information and exit.
|
||||
|
||||
Written by Martin v. Löwis <loewis@informatik.hu-berlin.de>,
|
||||
refactored / fixed by Thomas Waldmann <tw AT waldmann-edv DOT de>.
|
||||
"""
|
||||
|
||||
import sys, os
|
||||
import getopt, struct, array
|
||||
|
||||
__version__ = "1.3"
|
||||
|
||||
class SyntaxErrorException(Exception):
    """Raised when the content of a .po file cannot be parsed."""
|
||||
|
||||
class MsgFmt(object):
    """Transform the .po format into the .mo format."""

    def __init__(self):
        # Maps every msgid to its (non-fuzzy, non-empty) translation.
        self.messages = {}

    def make_filenames(self, filename, outfile=None):
        """Compute .mo name from .po name or language.

        Returns the (input, output) file names: '.po' is appended to
        *filename* when missing, and without an explicit *outfile* the
        output is the input name with a '.mo' extension.
        """
        infile = filename
        if not infile.endswith('.po'):
            infile += '.po'
        if outfile is None:
            outfile = os.path.splitext(infile)[0] + '.mo'
        return infile, outfile

    def add(self, id, str, fuzzy):
        """Add a non-fuzzy translation to the dictionary.

        Fuzzy entries and empty translations are ignored.
        """
        if fuzzy or not str:
            return
        self.messages[id] = str

    def read_po(self, lines):
        """Parse the lines of a .po file, collecting its translations.

        Raises SyntaxErrorException when a string is met outside of a
        msgid/msgstr section.
        """
        ID, STR = 1, 2
        section = None
        fuzzy = False
        msgid = msgstr = ''
        # Parse the catalog
        for index, line in enumerate(lines):
            if line.startswith('#'):
                # A comment line after a msgstr starts a new entry.
                if section == STR:
                    self.add(msgid, msgstr, fuzzy)
                    section = None
                    fuzzy = False
                # Record a fuzzy mark
                if line.startswith('#,') and 'fuzzy' in line:
                    fuzzy = True
                # Comments carry nothing else of interest.
                continue
            if line.startswith('msgid'):
                # A new msgid section: output the previous entry.
                if section == STR:
                    self.add(msgid, msgstr, fuzzy)
                    fuzzy = False
                section = ID
                line = line[5:]
                msgid = msgstr = ''
            elif line.startswith('msgstr'):
                section = STR
                line = line[6:]
            # Skip empty lines
            line = line.strip()
            if not line:
                continue
            # XXX: Does this always follow Python escape semantics?
            # NOTE(review): eval() runs whatever expression the line
            # contains; only trusted .po files should be processed.
            line = eval(line)
            if section == ID:
                msgid += line
            elif section == STR:
                msgstr += line
            else:
                raise SyntaxErrorException('Syntax error on line %d, before:\n%s' % (index + 1, line))
        # Add last entry
        if section == STR:
            self.add(msgid, msgstr, fuzzy)

    def generate_mo(self):
        """Return the generated output."""
        # the keys are sorted in the .mo file
        keys = sorted(self.messages.keys())
        idChunks = []
        strChunks = []
        spans = []
        idOffset = strOffset = 0
        for key in keys:
            # For each string, we need size and file offset.  Each string
            # is NUL terminated; the NUL does not count into the size.
            value = self.messages[key]
            spans.append((idOffset, len(key), strOffset, len(value)))
            idChunks.append(key + '\0')
            strChunks.append(value + '\0')
            idOffset += len(key) + 1
            strOffset += len(value) + 1
        ids = ''.join(idChunks)
        strs = ''.join(strChunks)
        # The header is 7 32-bit unsigned integers.  We don't use hash
        # tables, so the keys start right after the index tables.
        keystart = 7*4 + 16*len(keys)
        # and the values start after the keys
        valuestart = keystart + len(ids)
        # The string table first has the list of keys, then the list of
        # values.  Each entry has first the size of the string, then the
        # file offset.
        koffsets = []
        voffsets = []
        for idStart, idLen, strStart, strLen in spans:
            koffsets.extend((idLen, idStart + keystart))
            voffsets.extend((strLen, strStart + valuestart))
        header = struct.pack("Iiiiiii",
                             0x950412de,          # Magic
                             0,                   # Version
                             len(keys),           # # of entries
                             7*4,                 # start of key index
                             7*4 + len(keys)*8,   # start of value index
                             0, 0)                # size and offset of hash table
        table = array.array("i", koffsets + voffsets).tostring()
        return ''.join([header, table, ids, strs])
|
||||
|
||||
|
||||
def make(filename, outfile):
    """Compile the .po file *filename* into the .mo file *outfile*.

    The file names are normalized by MsgFmt.make_filenames.  Errors are
    reported on stderr; the process exits with status 1 when the input
    cannot be read or parsed.
    """
    mf = MsgFmt()
    infile, outfile = mf.make_filenames(filename, outfile)
    try:
        poFile = open(infile)
        try:
            lines = poFile.readlines()
        finally:
            poFile.close()
    except IOError:
        sys.stderr.write('%s\n' % (sys.exc_info()[1],))
        sys.exit(1)
    try:
        mf.read_po(lines)
        output = mf.generate_mo()
    except SyntaxErrorException:
        # There is no catalog to write: stop here, instead of going on
        # and failing on the undefined output.
        sys.stderr.write('%s\n' % (sys.exc_info()[1],))
        sys.exit(1)

    try:
        moFile = open(outfile, "wb")
        try:
            moFile.write(output)
        finally:
            moFile.close()
    except IOError:
        sys.stderr.write('%s\n' % (sys.exc_info()[1],))
|
||||
|
||||
|
||||
def usage(code, msg=''):
    """Write the module documentation, followed by *msg* (if any), to
    stderr and exit with status *code*."""
    err = sys.stderr
    err.write('%s\n' % (__doc__,))
    if msg:
        err.write('%s\n' % (msg,))
    sys.exit(code)
|
||||
|
||||
|
||||
def main():
    """Command line entry point: parse the options and compile every
    .po file given as argument."""
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hVo:', ['help', 'version', 'output-file='])
    except getopt.error:
        usage(1, sys.exc_info()[1])

    outfile = None
    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-V', '--version'):
            sys.stderr.write('%s %s\n' % ("msgfmt.py", __version__))
            sys.exit(0)
        elif opt in ('-o', '--output-file'):
            outfile = arg
    # do it
    if not args:
        sys.stderr.write('No input file given\n')
        sys.stderr.write("Try `msgfmt --help' for more information.\n")
        return

    # The same output file (if any) is passed along for every input.
    for filename in args:
        make(filename, outfile)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
@@ -1,49 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
rebuildmo.py script.
|
||||
|
||||
This script builds the .mo files, from the .po files.
|
||||
|
||||
Copyright 2009 H. Turgut Uyar <uyar@tekir.org>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import glob
|
||||
import msgfmt
|
||||
import os
|
||||
|
||||
#LOCALE_DIR = os.path.dirname(__file__)
|
||||
|
||||
def rebuildmo():
    """Compile every 'imdbpy-<lang>.po' file of the current directory
    into '<lang>/LC_MESSAGES/imdbpy.mo', creating the directories that
    are missing.

    Returns the list of the languages processed.
    """
    prefix, suffix = 'imdbpy-', '.po'
    created = []
    for input_file in glob.glob(prefix + '*' + suffix):
        # The language is what sits between the prefix and the extension.
        lang = input_file[len(prefix):-len(suffix)]
        mo_dir = os.path.join(lang, 'LC_MESSAGES')
        for directory in (lang, mo_dir):
            if not os.path.exists(directory):
                os.mkdir(directory)
        msgfmt.make(input_file, os.path.join(mo_dir, 'imdbpy.mo'))
        created.append(lang)
    return created
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
languages = rebuildmo()
|
||||
print 'Created locale for: %s.' % ' '.join(languages)
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
"""
|
||||
parser package (imdb package).
|
||||
|
||||
This package provides various parsers to access IMDb data (e.g.: a
|
||||
parser for the web/http interface, a parser for the SQL database
|
||||
interface, etc.).
|
||||
So far, the http/httpThin, mobile and sql parsers are implemented.
|
||||
|
||||
Copyright 2004-2009 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
__all__ = ['http', 'mobile', 'sql']
|
||||
|
||||
|
||||
@@ -1,775 +0,0 @@
|
||||
"""
|
||||
parser.http package (imdb package).
|
||||
|
||||
This package provides the IMDbHTTPAccessSystem class used to access
|
||||
IMDb's data through the web interface.
|
||||
The imdb.IMDb function will return an instance of this class when
|
||||
called with the 'accessSystem' argument set to "http" or "web"
|
||||
or "html" (this is the default).
|
||||
|
||||
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
|
||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import sys
|
||||
import logging
|
||||
from urllib import FancyURLopener, quote_plus
|
||||
from codecs import lookup
|
||||
|
||||
from imdb import IMDbBase, imdbURL_movie_main, imdbURL_person_main, \
|
||||
imdbURL_character_main, imdbURL_company_main, \
|
||||
imdbURL_keyword_main, imdbURL_find, imdbURL_top250, \
|
||||
imdbURL_bottom100
|
||||
from imdb.utils import analyze_title
|
||||
from imdb._exceptions import IMDbDataAccessError, IMDbParserError
|
||||
|
||||
import searchMovieParser
|
||||
import searchPersonParser
|
||||
import searchCharacterParser
|
||||
import searchCompanyParser
|
||||
import searchKeywordParser
|
||||
import movieParser
|
||||
import personParser
|
||||
import characterParser
|
||||
import companyParser
|
||||
import topBottomParser
|
||||
|
||||
# Logger for miscellaneous functions.
|
||||
_aux_logger = logging.getLogger('imdbpy.parser.http.aux')
|
||||
|
||||
IN_GAE = False
|
||||
try:
|
||||
import google.appengine
|
||||
IN_GAE = True
|
||||
_aux_logger.info('IMDbPY is running in the Google App Engine environment')
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
class _ModuleProxy:
|
||||
"""A proxy to instantiate and access parsers."""
|
||||
def __init__(self, module, defaultKeys=None, oldParsers=False,
|
||||
useModule=None, fallBackToNew=False):
|
||||
"""Initialize a proxy for the given module; defaultKeys, if set,
|
||||
muste be a dictionary of values to set for instanced objects."""
|
||||
if oldParsers or fallBackToNew:
|
||||
_aux_logger.warn('The old set of parsers was removed; falling ' \
|
||||
'back to the new parsers.')
|
||||
self.useModule = useModule
|
||||
if defaultKeys is None:
|
||||
defaultKeys = {}
|
||||
self._defaultKeys = defaultKeys
|
||||
self._module = module
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""Called only when no look-up is found."""
|
||||
_sm = self._module
|
||||
# Read the _OBJECTS dictionary to build the asked parser.
|
||||
if name in _sm._OBJECTS:
|
||||
_entry = _sm._OBJECTS[name]
|
||||
# Initialize the parser.
|
||||
kwds = {}
|
||||
if self.useModule:
|
||||
kwds = {'useModule': self.useModule}
|
||||
parserClass = _entry[0][0]
|
||||
obj = parserClass(**kwds)
|
||||
attrsToSet = self._defaultKeys.copy()
|
||||
attrsToSet.update(_entry[1] or {})
|
||||
# Set attribute to the object.
|
||||
for key in attrsToSet:
|
||||
setattr(obj, key, attrsToSet[key])
|
||||
setattr(self, name, obj)
|
||||
return obj
|
||||
return getattr(_sm, name)
|
||||
|
||||
|
||||
PY_VERSION = sys.version_info[:2]
|
||||
|
||||
|
||||
# The cookies for the "adult" search.
|
||||
# Please don't mess with these accounts.
|
||||
# Old 'IMDbPY' account.
|
||||
_old_cookie_id = 'boM2bYxz9MCsOnH9gZ0S9QHs12NWrNdApxsls1Vb5/NGrNdjcHx3dUas10UASoAjVEvhAbGagERgOpNkAPvxdbfKwaV2ikEj9SzXY1WPxABmDKQwdqzwRbM+12NSeJFGUEx3F8as10WwidLzVshDtxaPIbP13NdjVS9UZTYqgTVGrNcT9vyXU1'
|
||||
_old_cookie_uu = '3M3AXsquTU5Gur/Svik+ewflPm5Rk2ieY3BIPlLjyK3C0Dp9F8UoPgbTyKiGtZp4x1X+uAUGKD7BM2g+dVd8eqEzDErCoYvdcvGLvVLAen1y08hNQtALjVKAe+1hM8g9QbNonlG1/t4S82ieUsBbrSIQbq1yhV6tZ6ArvSbA7rgHc8n5AdReyAmDaJ5Wm/ee3VDoCnGj/LlBs2ieUZNorhHDKK5Q=='
|
||||
# New 'IMDbPYweb' account.
|
||||
_cookie_id = 'rH1jNAkjTlNXvHolvBVBsgaPICNZbNdjVjzFwzas9JRmusdjVoqBs/Hs12NR+1WFxEoR9bGKEDUg6sNlADqXwkas12N131Rwdb+UQNGKN8PWrNdjcdqBQVLq8mbGDHP3hqzxhbD692NQi9D0JjpBtRaPIbP1zNdjUOqENQYv1ADWrNcT9vyXU1'
|
||||
_cookie_uu = 'su4/m8cho4c6HP+W1qgq6wchOmhnF0w+lIWvHjRUPJ6nRA9sccEafjGADJ6hQGrMd4GKqLcz2X4z5+w+M4OIKnRn7FpENH7dxDQu3bQEHyx0ZEyeRFTPHfQEX03XF+yeN1dsPpcXaqjUZAw+lGRfXRQEfz3RIX9IgVEffdBAHw2wQXyf9xdMPrQELw0QNB8dsffsqcdQemjPB0w+moLcPh0JrKrHJ9hjBzdMPpcXTH7XRwwOk='
|
||||
|
||||
# imdbpy2010 account.
|
||||
#_cookie_id = 'QrCdxVi+L+WgqOLrQJJgBgRRXGInphxiBPU/YXSFDyExMFzCp6YcYgSVXyEUhS/xMID8wqemHGID4DlntwZ49vemP5UXsAxiJ4D6goSmHGIgNT9hMXBaRSF2vMS3phxB0bVfQiQlP1RxdrzhB6YcRHFASyIhQVowwXCKtDSlD2YhgRvxBsCKtGemHBKH9mxSI='
|
||||
#_cookie_uu = 'oiEo2yoJFCA2Zbn/o7Z1LAPIwotAu6QdALv3foDb1x5F/tdrFY63XkSfty4kntS8Y8jkHSDLt3406+d+JThEilPI0mtTaOQdA/t2/iErp22jaLdeVU5ya4PIREpj7HFdpzhEHadcIAngSER50IoHDpD6Bz4Qy3b+UIhE/hBbhz5Q63ceA2hEvhPo5B0FnrL9Q8jkWjDIbA0Au3d+AOtnXoCIRL4Q28c+UOtnXpP4RL4T6OQdA+6ijUCI5B0AW2d+UOtnXpPYRL4T6OQdA8jkTUOYlC0A=='
|
||||
|
||||
|
||||
class _FakeURLOpener(object):
|
||||
"""Fake URLOpener object, used to return empty strings instead of
|
||||
errors.
|
||||
"""
|
||||
def __init__(self, url, headers):
|
||||
self.url = url
|
||||
self.headers = headers
|
||||
def read(self, *args, **kwds): return ''
|
||||
def close(self, *args, **kwds): pass
|
||||
def info(self, *args, **kwds): return self.headers
|
||||
|
||||
|
||||
class IMDbURLopener(FancyURLopener):
    """Fetch web pages and handle errors."""
    _logger = logging.getLogger('imdbpy.parser.http.urlopener')

    def __init__(self, *args, **kwargs):
        """Set up the opener with a Mozilla User-Agent and the default
        account cookie; arguments are passed to FancyURLopener."""
        self._last_url = u''
        FancyURLopener.__init__(self, *args, **kwargs)
        # Headers to add to every request.
        # XXX: IMDb's web server doesn't like urllib-based programs,
        #      so let's pretend to be Mozilla.
        for header in ('User-Agent', 'User-agent', 'user-agent'):
            self.del_header(header)
        self.set_header('User-Agent', 'Mozilla/5.0')
        # XXX: This class is used also to perform "Exact Primary
        #      [Title|Name]" searches, and so by default the cookie is set.
        c_header = 'id=%s; uu=%s' % (_cookie_id, _cookie_uu)
        self.set_header('Cookie', c_header)

    def get_proxy(self):
        """Return the used proxy, or an empty string."""
        return self.proxies.get('http', '')

    def set_proxy(self, proxy):
        """Set the proxy; a false value removes the 'http' proxy.

        'http://' is prepended when the given string does not start
        with it.
        """
        if not proxy:
            if 'http' in self.proxies:
                del self.proxies['http']
        else:
            if not proxy.lower().startswith('http://'):
                proxy = 'http://%s' % proxy
            self.proxies['http'] = proxy

    def set_header(self, header, value, _overwrite=True):
        """Set a default header; with _overwrite, an existing header of
        the same name is removed first."""
        if _overwrite:
            self.del_header(header)
        self.addheaders.append((header, value))

    def del_header(self, header):
        """Remove the first default header whose name equals header."""
        for index, (name, _value) in enumerate(self.addheaders):
            if name == header:
                del self.addheaders[index]
                break

    def retrieve_unicode(self, url, size=-1):
        """Retrieves the given URL, and returns a unicode string,
        trying to guess the encoding of the data (assuming latin_1
        by default).

        When size is not -1, a 'Range: bytes=0-<size>' header is sent for
        this request only.  An IOError is re-raised as IMDbDataAccessError.
        """
        encode = None
        try:
            if size != -1:
                self.set_header('Range', 'bytes=0-%d' % size)
            uopener = self.open(url)
            try:
                kwds = {}
                if PY_VERSION > (2, 3) and not IN_GAE:
                    kwds['size'] = size
                content = uopener.read(**kwds)
                self._last_url = uopener.url
                # Maybe the server is so nice to tell us the charset...
                server_encode = uopener.info().getparam('charset')
                # Otherwise, look at the content-type HTML meta tag.
                if server_encode is None and content:
                    first_bytes = content[:512]
                    begin_h = first_bytes.find('text/html; charset=')
                    if begin_h != -1:
                        # 19 == len('text/html; charset=')
                        end_h = first_bytes[19+begin_h:].find('"')
                        if end_h != -1:
                            server_encode = \
                                    first_bytes[19+begin_h:19+begin_h+end_h]
                if server_encode:
                    try:
                        # Accept the charset only if Python knows it.
                        if lookup(server_encode):
                            encode = server_encode
                    except (LookupError, ValueError, TypeError):
                        pass
            finally:
                # Release the response even when reading/parsing fails.
                uopener.close()
            self.close()
        except IOError as e:
            raise IMDbDataAccessError({'errcode': e.errno,
                                       'errmsg': str(e.strerror),
                                       'url': url,
                                       'proxy': self.get_proxy(),
                                       'exception type': 'IOError',
                                       'original exception': e})
        finally:
            # Always drop the per-request Range header, whatever exception
            # was raised; otherwise it would leak into later requests.
            if size != -1:
                self.del_header('Range')
        if encode is None:
            encode = 'latin_1'
            # The detection of the encoding is error prone...
            self._logger.warn('Unable to detect the encoding of the retrieved '
                        'page [%s]; falling back to default latin1.', encode)
        return unicode(content, encode, 'replace')

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Return an empty fake response for a 404 (after logging it);
        raise IMDbDataAccessError for any other HTTP error code."""
        if errcode == 404:
            self._logger.warn('404 code returned for %s: %s (headers: %s)',
                                url, errmsg, headers)
            return _FakeURLOpener(url, headers)
        raise IMDbDataAccessError({'url': 'http:%s' % url,
                                   'errcode': errcode,
                                   'errmsg': errmsg,
                                   'headers': headers,
                                   'error type': 'http_error_default',
                                   'proxy': self.get_proxy()})

    def open_unknown(self, fullurl, data=None):
        """Raise IMDbDataAccessError describing the unknown URL type."""
        raise IMDbDataAccessError({'fullurl': fullurl,
                                   'data': str(data),
                                   'error type': 'open_unknown',
                                   'proxy': self.get_proxy()})

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Raise IMDbDataAccessError describing the unknown proxy type."""
        raise IMDbDataAccessError({'proxy': str(proxy),
                                   'fullurl': fullurl,
                                   'error type': 'open_unknown_proxy',
                                   'data': str(data)})
|
||||
|
||||
|
||||
class IMDbHTTPAccessSystem(IMDbBase):
|
||||
"""The class used to access IMDb's data through the web."""
|
||||
|
||||
accessSystem = 'http'
|
||||
_http_logger = logging.getLogger('imdbpy.parser.http')
|
||||
|
||||
def __init__(self, isThin=0, adultSearch=1, proxy=-1, oldParsers=False,
             fallBackToNew=False, useModule=None, cookie_id=-1,
             cookie_uu=None, *arguments, **keywords):
    """Initialize the access system.

    Extra positional and keyword arguments go to IMDbBase.__init__.
    A value of -1 for proxy or cookie_id means "leave the default".
    """
    IMDbBase.__init__(self, *arguments, **keywords)
    self.urlOpener = IMDbURLopener()
    # When isThin is set, we're parsing the "maindetails" page
    # of a movie (instead of the "combined" page) and movie/person
    # references are not collected if no defaultModFunct is provided.
    self.isThin = isThin
    self._getRefs = True
    self._mdparse = False
    if isThin:
        if self.accessSystem == 'http':
            self.accessSystem = 'httpThin'
        self._mdparse = True
        if self._defModFunct is None:
            self._getRefs = False
            from imdb.utils import modNull
            self._defModFunct = modNull
    self.do_adult_search(adultSearch)
    if cookie_id != -1:
        if cookie_id is None:
            self.del_cookies()
        elif cookie_uu is not None:
            self.set_cookies(cookie_id, cookie_uu)
    if proxy != -1:
        self.set_proxy(proxy)
    if useModule is not None:
        # A comma-separated string becomes a list of module names.
        if not isinstance(useModule, (list, tuple)) and ',' in useModule:
            useModule = useModule.split(',')
    _def = {'_modFunct': self._defModFunct, '_as': self.accessSystem}
    # Proxy objects: one _ModuleProxy per parser module, all sharing the
    # same options and the same default-attributes dictionary.
    proxy_options = {'defaultKeys': _def, 'oldParsers': oldParsers,
                     'useModule': useModule, 'fallBackToNew': fallBackToNew}
    parser_modules = (
        ('smProxy', searchMovieParser),
        ('spProxy', searchPersonParser),
        ('scProxy', searchCharacterParser),
        ('scompProxy', searchCompanyParser),
        ('skProxy', searchKeywordParser),
        ('mProxy', movieParser),
        ('pProxy', personParser),
        ('cProxy', characterParser),
        ('compProxy', companyParser),
        ('topBottomProxy', topBottomParser),
    )
    for attr_name, parser_module in parser_modules:
        setattr(self, attr_name,
                _ModuleProxy(parser_module, **proxy_options))
|
||||
|
||||
def _normalize_movieID(self, movieID):
|
||||
"""Normalize the given movieID."""
|
||||
try:
|
||||
return '%07d' % int(movieID)
|
||||
except ValueError, e:
|
||||
raise IMDbParserError, 'invalid movieID "%s": %s' % (movieID, e)
|
||||
|
||||
def _normalize_personID(self, personID):
|
||||
"""Normalize the given personID."""
|
||||
try:
|
||||
return '%07d' % int(personID)
|
||||
except ValueError, e:
|
||||
raise IMDbParserError, 'invalid personID "%s": %s' % (personID, e)
|
||||
|
||||
def _normalize_characterID(self, characterID):
|
||||
"""Normalize the given characterID."""
|
||||
try:
|
||||
return '%07d' % int(characterID)
|
||||
except ValueError, e:
|
||||
raise IMDbParserError, 'invalid characterID "%s": %s' % \
|
||||
(characterID, e)
|
||||
|
||||
def _normalize_companyID(self, companyID):
|
||||
"""Normalize the given companyID."""
|
||||
try:
|
||||
return '%07d' % int(companyID)
|
||||
except ValueError, e:
|
||||
raise IMDbParserError, 'invalid companyID "%s": %s' % \
|
||||
(companyID, e)
|
||||
|
||||
def get_imdbMovieID(self, movieID):
    """Return the imdbID for movieID.

    For this access system the two identifiers coincide, so the
    argument is handed back unchanged.
    """
    return movieID
|
||||
|
||||
def get_imdbPersonID(self, personID):
    """Return the imdbID for personID.

    For this access system the two identifiers coincide, so the
    argument is handed back unchanged.
    """
    return personID
|
||||
|
||||
def get_imdbCharacterID(self, characterID):
    """Return the imdbID for characterID.

    For this access system the two identifiers coincide, so the
    argument is handed back unchanged.
    """
    return characterID
|
||||
|
||||
def get_imdbCompanyID(self, companyID):
    """Return the imdbID for companyID.

    For this access system the two identifiers coincide, so the
    argument is handed back unchanged.
    """
    return companyID
|
||||
|
||||
def get_proxy(self):
    """Return whatever the URL opener reports as its proxy (the used
    proxy or an empty string)."""
    opener = self.urlOpener
    return opener.get_proxy()
|
||||
|
||||
def set_proxy(self, proxy):
    """Set the web proxy to use, by delegating to the URL opener.

    The value should be a string like 'http://localhost:8080/'; with an
    empty string no proxy will be used.
    If set, the value of the environment variable HTTP_PROXY is
    automatically used.
    """
    opener = self.urlOpener
    opener.set_proxy(proxy)
|
||||
|
||||
def set_cookies(self, cookie_id, cookie_uu):
    """Install the 'Cookie' header used to access an IMDb account.

    The header value has the form 'id=<cookie_id>; uu=<cookie_uu>'.
    """
    self.urlOpener.set_header('Cookie',
                              'id=%s; uu=%s' % (cookie_id, cookie_uu))
|
||||
|
||||
def del_cookies(self):
    """Drop the 'Cookie' header from the URL opener."""
    opener = self.urlOpener
    opener.del_header('Cookie')
|
||||
|
||||
def do_adult_search(self, doAdult,
                    cookie_id=_cookie_id, cookie_uu=_cookie_uu):
    """Enable or disable 'adult' movies in the search results.

    If doAdult is true the account cookie is set; cookie_id and
    cookie_uu are optional parameters to select a specific account
    (see your cookie or cookies.txt file).  Otherwise the 'Cookie'
    header is removed from the URL opener.
    """
    if not doAdult:
        self.urlOpener.del_header('Cookie')
        return
    self.set_cookies(cookie_id, cookie_uu)
|
||||
|
||||
def _retrieve(self, url, size=-1):
|
||||
"""Retrieve the given URL."""
|
||||
##print url
|
||||
self._http_logger.debug('fetching url %s (size: %d)', url, size)
|
||||
return self.urlOpener.retrieve_unicode(url, size=size)
|
||||
|
||||
def _get_search_content(self, kind, ton, results):
    """Retrieve the web page for a given search.

    kind can be 'tt' (for titles), 'nm' (for names),
    'char' (for characters) or 'co' (for companies); 'ep' is rewritten
    into a title search with ttype=ep.
    ton is the title or the name to search.
    results is the maximum number of results to be retrieved.
    """
    if isinstance(ton, unicode):
        ton = ton.encode('utf-8')
    quoted = quote_plus(ton)
    query = 's=%s;mx=%s;q=%s' % (kind, str(results), quoted)
    if kind == 'ep':
        query = query.replace('s=ep;', 's=tt;ttype=ep;', 1)
    page = self._retrieve(imdbURL_find % query)
    too_many = page.find('Your search returned more than') != -1
    only_exact = page.find("displayed the exact matches") != -1
    if not (too_many and only_exact):
        return page
    # The retrieved page contains no results, because too many
    # titles or names contain the string we're looking for: ask again
    # with lm=0 and cap the amount of data to download.
    query = 's=%s;q=%s;lm=0' % (kind, quote_plus(ton))
    max_bytes = 22528 + results * 512
    return self._retrieve(imdbURL_find % query, size=max_bytes)
|
||||
|
||||
def _search_movie(self, title, results):
|
||||
# The URL of the query.
|
||||
# XXX: To retrieve the complete results list:
|
||||
# params = urllib.urlencode({'more': 'tt', 'q': title})
|
||||
##params = urllib.urlencode({'tt': 'on','mx': str(results),'q': title})
|
||||
##params = 'q=%s&tt=on&mx=%s' % (quote_plus(title), str(results))
|
||||
##cont = self._retrieve(imdbURL_find % params)
|
||||
cont = self._get_search_content('tt', title, results)
|
||||
return self.smProxy.search_movie_parser.parse(cont, results=results)['data']
|
||||
|
||||
def _search_episode(self, title, results):
    """Run an episode ('ep') search and return the 'data' entry produced
    by search_movie_parser.

    When analyze_title() classifies title as an episode, only its
    'title' component is used as the search string.
    """
    analyzed = analyze_title(title)
    query = title
    if analyzed['kind'] == 'episode':
        query = analyzed['title']
    page = self._get_search_content('ep', query, results)
    parsed = self.smProxy.search_movie_parser.parse(page, results=results)
    return parsed['data']
|
||||
|
||||
def get_movie_main(self, movieID):
    """Fetch and parse the main page of movieID.

    The 'maindetails' page is used when isThin is set, the 'combined'
    page otherwise; mdparse is passed on to movie_parser.
    """
    page_name = 'combined'
    if self.isThin:
        page_name = 'maindetails'
    page = self._retrieve(imdbURL_movie_main % movieID + page_name)
    return self.mProxy.movie_parser.parse(page, mdparse=self._mdparse)
|
||||
|
||||
def get_movie_full_credits(self, movieID):
    """Fetch the 'fullcredits' page of movieID and parse it with
    movie_parser."""
    url = imdbURL_movie_main % movieID + 'fullcredits'
    page = self._retrieve(url)
    return self.mProxy.movie_parser.parse(page)
|
||||
|
||||
def get_movie_plot(self, movieID):
    """Fetch the 'plotsummary' page of movieID and parse it with
    plot_parser, passing the instance's getRefs setting."""
    url = imdbURL_movie_main % movieID + 'plotsummary'
    page = self._retrieve(url)
    return self.mProxy.plot_parser.parse(page, getRefs=self._getRefs)
|
||||
|
||||
def get_movie_awards(self, movieID):
    """Fetch the 'awards' page of movieID and parse it with
    movie_awards_parser."""
    url = imdbURL_movie_main % movieID + 'awards'
    page = self._retrieve(url)
    return self.mProxy.movie_awards_parser.parse(page)
|
||||
|
||||
def get_movie_taglines(self, movieID):
    """Fetch the 'taglines' page of movieID and parse it with
    taglines_parser."""
    url = imdbURL_movie_main % movieID + 'taglines'
    page = self._retrieve(url)
    return self.mProxy.taglines_parser.parse(page)
|
||||
|
||||
def get_movie_keywords(self, movieID):
    """Fetch the 'keywords' page of movieID and parse it with
    keywords_parser."""
    url = imdbURL_movie_main % movieID + 'keywords'
    page = self._retrieve(url)
    return self.mProxy.keywords_parser.parse(page)
|
||||
|
||||
def get_movie_alternate_versions(self, movieID):
    """Fetch the 'alternateversions' page of movieID and parse it with
    alternateversions_parser, passing the instance's getRefs setting."""
    url = imdbURL_movie_main % movieID + 'alternateversions'
    page = self._retrieve(url)
    parser = self.mProxy.alternateversions_parser
    return parser.parse(page, getRefs=self._getRefs)
|
||||
|
||||
def get_movie_crazy_credits(self, movieID):
    """Fetch the 'crazycredits' page of movieID and parse it with
    crazycredits_parser, passing the instance's getRefs setting."""
    url = imdbURL_movie_main % movieID + 'crazycredits'
    page = self._retrieve(url)
    parser = self.mProxy.crazycredits_parser
    return parser.parse(page, getRefs=self._getRefs)
|
||||
|
||||
def get_movie_goofs(self, movieID):
    """Fetch the 'goofs' page of movieID and parse it with goofs_parser,
    passing the instance's getRefs setting."""
    url = imdbURL_movie_main % movieID + 'goofs'
    page = self._retrieve(url)
    return self.mProxy.goofs_parser.parse(page, getRefs=self._getRefs)
|
||||
|
||||
def get_movie_quotes(self, movieID):
    """Fetch the 'quotes' page of movieID and parse it with quotes_parser,
    passing the instance's getRefs setting."""
    url = imdbURL_movie_main % movieID + 'quotes'
    page = self._retrieve(url)
    return self.mProxy.quotes_parser.parse(page, getRefs=self._getRefs)
|
||||
|
||||
def get_movie_release_dates(self, movieID):
    """Fetch the 'releaseinfo' page of movieID and parse it with
    releasedates_parser.

    The result is tagged as covering both the 'release dates' and the
    'akas' info sets.
    """
    url = imdbURL_movie_main % movieID + 'releaseinfo'
    parsed = self.mProxy.releasedates_parser.parse(self._retrieve(url))
    parsed['info sets'] = ('release dates', 'akas')
    return parsed
|
||||
get_movie_akas = get_movie_release_dates
|
||||
|
||||
def get_movie_vote_details(self, movieID):
    """Fetch the 'ratings' page of movieID and parse it with
    ratings_parser."""
    url = imdbURL_movie_main % movieID + 'ratings'
    page = self._retrieve(url)
    return self.mProxy.ratings_parser.parse(page)
|
||||
|
||||
def get_movie_official_sites(self, movieID):
    """Fetch the 'officialsites' page of movieID and parse it with
    officialsites_parser."""
    url = imdbURL_movie_main % movieID + 'officialsites'
    page = self._retrieve(url)
    return self.mProxy.officialsites_parser.parse(page)
|
||||
|
||||
def get_movie_trivia(self, movieID):
    """Fetch the 'trivia' page of movieID and parse it with trivia_parser,
    passing the instance's getRefs setting."""
    url = imdbURL_movie_main % movieID + 'trivia'
    page = self._retrieve(url)
    return self.mProxy.trivia_parser.parse(page, getRefs=self._getRefs)
|
||||
|
||||
def get_movie_connections(self, movieID):
    """Fetch the 'movieconnections' page of movieID and parse it with
    connections_parser."""
    url = imdbURL_movie_main % movieID + 'movieconnections'
    page = self._retrieve(url)
    return self.mProxy.connections_parser.parse(page)
|
||||
|
||||
def get_movie_technical(self, movieID):
    """Fetch the 'technical' page of movieID and parse it with
    tech_parser."""
    url = imdbURL_movie_main % movieID + 'technical'
    page = self._retrieve(url)
    return self.mProxy.tech_parser.parse(page)
|
||||
|
||||
def get_movie_business(self, movieID):
    """Fetch the 'business' page of movieID and parse it with
    business_parser, passing the instance's getRefs setting."""
    url = imdbURL_movie_main % movieID + 'business'
    page = self._retrieve(url)
    return self.mProxy.business_parser.parse(page, getRefs=self._getRefs)
|
||||
|
||||
def get_movie_literature(self, movieID):
    """Fetch the 'literature' page of movieID and parse it with
    literature_parser."""
    url = imdbURL_movie_main % movieID + 'literature'
    page = self._retrieve(url)
    return self.mProxy.literature_parser.parse(page)
|
||||
|
||||
def get_movie_locations(self, movieID):
    """Fetch the 'locations' page of movieID and parse it with
    locations_parser."""
    url = imdbURL_movie_main % movieID + 'locations'
    page = self._retrieve(url)
    return self.mProxy.locations_parser.parse(page)
|
||||
|
||||
def get_movie_soundtrack(self, movieID):
    """Fetch the 'soundtrack' page of movieID and parse it with
    soundtrack_parser."""
    url = imdbURL_movie_main % movieID + 'soundtrack'
    page = self._retrieve(url)
    return self.mProxy.soundtrack_parser.parse(page)
|
||||
|
||||
def get_movie_dvd(self, movieID):
    """Fetch the 'dvd' page of movieID and parse it with dvd_parser,
    passing the instance's getRefs setting."""
    url = imdbURL_movie_main % movieID + 'dvd'
    page = self._retrieve(url)
    return self.mProxy.dvd_parser.parse(page, getRefs=self._getRefs)
|
||||
|
||||
def get_movie_recommendations(self, movieID):
    """Fetch the 'recommendations' page of movieID and parse it with
    rec_parser."""
    url = imdbURL_movie_main % movieID + 'recommendations'
    page = self._retrieve(url)
    return self.mProxy.rec_parser.parse(page)
|
||||
|
||||
def get_movie_external_reviews(self, movieID):
    """Fetch the 'externalreviews' page of movieID and parse it with
    externalrev_parser."""
    url = imdbURL_movie_main % movieID + 'externalreviews'
    page = self._retrieve(url)
    return self.mProxy.externalrev_parser.parse(page)
|
||||
|
||||
def get_movie_newsgroup_reviews(self, movieID):
    """Fetch the 'newsgroupreviews' page of movieID and parse it with
    newsgrouprev_parser."""
    url = imdbURL_movie_main % movieID + 'newsgroupreviews'
    page = self._retrieve(url)
    return self.mProxy.newsgrouprev_parser.parse(page)
|
||||
|
||||
def get_movie_misc_sites(self, movieID):
    """Fetch the 'miscsites' page of movieID and parse it with
    misclinks_parser."""
    url = imdbURL_movie_main % movieID + 'miscsites'
    page = self._retrieve(url)
    return self.mProxy.misclinks_parser.parse(page)
|
||||
|
||||
def get_movie_sound_clips(self, movieID):
    """Fetch the 'soundsites' page of movieID and parse it with
    soundclips_parser."""
    url = imdbURL_movie_main % movieID + 'soundsites'
    page = self._retrieve(url)
    return self.mProxy.soundclips_parser.parse(page)
|
||||
|
||||
def get_movie_video_clips(self, movieID):
    """Fetch the 'videosites' page of movieID and parse it with
    videoclips_parser."""
    url = imdbURL_movie_main % movieID + 'videosites'
    page = self._retrieve(url)
    return self.mProxy.videoclips_parser.parse(page)
|
||||
|
||||
def get_movie_photo_sites(self, movieID):
    """Fetch the 'photosites' page of movieID and parse it with
    photosites_parser."""
    url = imdbURL_movie_main % movieID + 'photosites'
    page = self._retrieve(url)
    return self.mProxy.photosites_parser.parse(page)
|
||||
|
||||
def get_movie_news(self, movieID):
    """Fetch the 'news' page of movieID and parse it with news_parser,
    passing the instance's getRefs setting."""
    url = imdbURL_movie_main % movieID + 'news'
    page = self._retrieve(url)
    return self.mProxy.news_parser.parse(page, getRefs=self._getRefs)
|
||||
|
||||
def get_movie_amazon_reviews(self, movieID):
    """Fetch the 'amazon' page of movieID and parse it with
    amazonrev_parser."""
    url = imdbURL_movie_main % movieID + 'amazon'
    page = self._retrieve(url)
    return self.mProxy.amazonrev_parser.parse(page)
|
||||
|
||||
def get_movie_guests(self, movieID):
    """Fetch the 'epcast' page of movieID and parse it with
    episodes_cast_parser."""
    url = imdbURL_movie_main % movieID + 'epcast'
    page = self._retrieve(url)
    return self.mProxy.episodes_cast_parser.parse(page)
|
||||
get_movie_episodes_cast = get_movie_guests
|
||||
|
||||
def get_movie_merchandising_links(self, movieID):
    """Fetch the 'sales' page of movieID and parse it with
    sales_parser."""
    url = imdbURL_movie_main % movieID + 'sales'
    page = self._retrieve(url)
    return self.mProxy.sales_parser.parse(page)
|
||||
|
||||
def get_movie_episodes(self, movieID):
    """Fetch the 'episodes' page of movieID and parse it with
    episodes_parser.

    Every parsed episode gets movie['episode of'].movieID set to
    movieID; when at least one episode was found, their count is stored
    under 'number of episodes' in the 'data' dictionary.
    """
    page = self._retrieve(imdbURL_movie_main % movieID + 'episodes')
    parsed = self.mProxy.episodes_parser.parse(page)
    if 'episodes' not in parsed.get('data', {}):
        return parsed
    data = parsed['data']
    episodes_count = 0
    # 'episodes' maps seasons to dictionaries of episodes.
    for season in data['episodes'].values():
        for episode in season.values():
            episode['episode of'].movieID = movieID
            episodes_count += 1
    if episodes_count:
        data['number of episodes'] = episodes_count
    return parsed
|
||||
|
||||
def get_movie_episodes_rating(self, movieID):
    """Fetch the 'epdate' page of movieID and parse it with
    eprating_parser.

    For every item under 'episodes rating', the episode's
    movie['episode of'].movieID is set to movieID.
    """
    page = self._retrieve(imdbURL_movie_main % movieID + 'epdate')
    parsed = self.mProxy.eprating_parser.parse(page)
    if 'episodes rating' not in parsed.get('data', {}):
        return parsed
    for rating in parsed['data']['episodes rating']:
        rating['episode']['episode of'].movieID = movieID
    return parsed
|
||||
|
||||
def get_movie_faqs(self, movieID):
    """Fetch the 'faq' page of movieID and parse it with
    movie_faqs_parser, passing the instance's getRefs setting."""
    url = imdbURL_movie_main % movieID + 'faq'
    page = self._retrieve(url)
    return self.mProxy.movie_faqs_parser.parse(page, getRefs=self._getRefs)
|
||||
|
||||
def get_movie_airing(self, movieID):
    """Fetch the 'tvschedule' page of movieID and parse it with
    airing_parser."""
    url = imdbURL_movie_main % movieID + 'tvschedule'
    page = self._retrieve(url)
    return self.mProxy.airing_parser.parse(page)
|
||||
|
||||
get_movie_tv_schedule = get_movie_airing
|
||||
|
||||
def get_movie_synopsis(self, movieID):
    """Fetch the 'synopsis' page of movieID and parse it with
    synopsis_parser."""
    url = imdbURL_movie_main % movieID + 'synopsis'
    page = self._retrieve(url)
    return self.mProxy.synopsis_parser.parse(page)
|
||||
|
||||
def get_movie_parents_guide(self, movieID):
    """Fetch the 'parentalguide' page of movieID and parse it with
    parentsguide_parser."""
    url = imdbURL_movie_main % movieID + 'parentalguide'
    page = self._retrieve(url)
    return self.mProxy.parentsguide_parser.parse(page)
|
||||
|
||||
def _search_person(self, name, results):
|
||||
# The URL of the query.
|
||||
# XXX: To retrieve the complete results list:
|
||||
# params = urllib.urlencode({'more': 'nm', 'q': name})
|
||||
##params = urllib.urlencode({'nm': 'on', 'mx': str(results), 'q': name})
|
||||
#params = 'q=%s&nm=on&mx=%s' % (quote_plus(name), str(results))
|
||||
#cont = self._retrieve(imdbURL_find % params)
|
||||
cont = self._get_search_content('nm', name, results)
|
||||
return self.spProxy.search_person_parser.parse(cont, results=results)['data']
|
||||
|
||||
def get_person_main(self, personID):
    """Fetch the 'maindetails' page of personID and parse it with
    maindetails_parser.

    The result is tagged as covering both the 'main' and the
    'filmography' info sets.
    """
    url = imdbURL_person_main % personID + 'maindetails'
    parsed = self.pProxy.maindetails_parser.parse(self._retrieve(url))
    parsed['info sets'] = ('main', 'filmography')
    return parsed
|
||||
|
||||
def get_person_filmography(self, personID):
    """Return the result of get_person_main() for personID."""
    main_info = self.get_person_main(personID)
    return main_info
|
||||
|
||||
def get_person_biography(self, personID):
    """Fetch the 'bio' page of personID and parse it with bio_parser,
    passing the instance's getRefs setting."""
    url = imdbURL_person_main % personID + 'bio'
    page = self._retrieve(url)
    return self.pProxy.bio_parser.parse(page, getRefs=self._getRefs)
|
||||
|
||||
def get_person_awards(self, personID):
    """Fetch the 'awards' page of personID and parse it with
    person_awards_parser."""
    url = imdbURL_person_main % personID + 'awards'
    page = self._retrieve(url)
    return self.pProxy.person_awards_parser.parse(page)
|
||||
|
||||
def get_person_other_works(self, personID):
    """Fetch the 'otherworks' page of personID and parse it with
    otherworks_parser, passing the instance's getRefs setting."""
    url = imdbURL_person_main % personID + 'otherworks'
    page = self._retrieve(url)
    return self.pProxy.otherworks_parser.parse(page, getRefs=self._getRefs)
|
||||
|
||||
#def get_person_agent(self, personID):
|
||||
# cont = self._retrieve(imdbURL_person_main % personID + 'agent')
|
||||
# return self.pProxy.agent_parser.parse(cont)
|
||||
|
||||
def get_person_publicity(self, personID):
    """Fetch the 'publicity' page of personID and parse it with
    publicity_parser."""
    url = imdbURL_person_main % personID + 'publicity'
    page = self._retrieve(url)
    return self.pProxy.publicity_parser.parse(page)
|
||||
|
||||
def get_person_official_sites(self, personID):
    """Fetch the 'officialsites' page of personID and parse it with
    person_officialsites_parser."""
    url = imdbURL_person_main % personID + 'officialsites'
    page = self._retrieve(url)
    return self.pProxy.person_officialsites_parser.parse(page)
|
||||
|
||||
def get_person_news(self, personID):
    """Fetch the 'news' page of personID and parse it with
    news_parser."""
    url = imdbURL_person_main % personID + 'news'
    page = self._retrieve(url)
    return self.pProxy.news_parser.parse(page)
|
||||
|
||||
def get_person_episodes(self, personID):
    """Fetch the 'filmoseries' page of personID and parse it with
    person_series_parser."""
    url = imdbURL_person_main % personID + 'filmoseries'
    page = self._retrieve(url)
    return self.pProxy.person_series_parser.parse(page)
|
||||
|
||||
def get_person_merchandising_links(self, personID):
    """Fetch the 'forsale' page of personID and parse it with
    sales_parser."""
    url = imdbURL_person_main % personID + 'forsale'
    page = self._retrieve(url)
    return self.pProxy.sales_parser.parse(page)
|
||||
|
||||
def get_person_genres_links(self, personID):
    """Fetch the 'filmogenre' page of personID and parse it with
    person_genres_parser."""
    url = imdbURL_person_main % personID + 'filmogenre'
    page = self._retrieve(url)
    return self.pProxy.person_genres_parser.parse(page)
|
||||
|
||||
def get_person_keywords_links(self, personID):
    """Fetch the 'filmokey' page of personID and parse it with
    person_keywords_parser."""
    url = imdbURL_person_main % personID + 'filmokey'
    page = self._retrieve(url)
    return self.pProxy.person_keywords_parser.parse(page)
|
||||
|
||||
def _search_character(self, name, results):
|
||||
cont = self._get_search_content('char', name, results)
|
||||
return self.scProxy.search_character_parser.parse(cont, results=results)['data']
|
||||
|
||||
def get_character_main(self, characterID):
    """Retrieve and parse the main page of a character.

    The parsed result is tagged with the info sets it covers
    ('main' and 'filmography') before being returned.
    """
    html = self._retrieve(imdbURL_character_main % characterID)
    info = self.cProxy.character_main_parser.parse(html)
    info['info sets'] = ('main', 'filmography')
    return info

# the filmography is served by the same page as the main details
get_character_filmography = get_character_main
|
||||
|
||||
def get_character_biography(self, characterID):
    """Retrieve the character's 'bio' page and return its parsed form.

    The parser is called with ``getRefs=self._getRefs``.
    """
    page_url = imdbURL_character_main % characterID + 'bio'
    html = self._retrieve(page_url)
    return self.cProxy.character_bio_parser.parse(html, getRefs=self._getRefs)
|
||||
|
||||
def get_character_episodes(self, characterID):
    """Retrieve the character's 'filmoseries' page and return its parsed form."""
    page_url = imdbURL_character_main % characterID + 'filmoseries'
    html = self._retrieve(page_url)
    return self.cProxy.character_series_parser.parse(html)
|
||||
|
||||
def get_character_quotes(self, characterID):
    """Retrieve the character's 'quotes' page and return its parsed form.

    The parser is called with ``getRefs=self._getRefs``.
    """
    page_url = imdbURL_character_main % characterID + 'quotes'
    html = self._retrieve(page_url)
    return self.cProxy.character_quotes_parser.parse(html, getRefs=self._getRefs)
|
||||
|
||||
def _search_company(self, name, results):
    """Search for a company by name.

    Fetches the 'co' search content, then hands the parser both the page
    and the last URL recorded by the URL opener. Returns the ``'data'``
    entry of the parsed result.
    """
    html = self._get_search_content('co', name, results)
    # read after the request above, so it reflects that request's URL
    last_url = self.urlOpener._last_url
    parsed = self.scompProxy.search_company_parser.parse(html, url=last_url,
                                                         results=results)
    return parsed['data']
|
||||
|
||||
def get_company_main(self, companyID):
    """Retrieve the main page of a company and return its parsed form."""
    html = self._retrieve(imdbURL_company_main % companyID)
    return self.compProxy.company_main_parser.parse(html)
|
||||
|
||||
def _search_keyword(self, keyword, results):
    """Search for keywords matching `keyword`.

    Returns the ``'data'`` entry of the parsed search page, or an empty
    list (after logging a warning) when the page cannot be retrieved.
    """
    # XXX: the IMDb web server seems to have some serious problem with
    #      non-ascii keyword.
    #      E.g.: http://akas.imdb.com/keyword/fianc%E9/
    #      will return a 500 Internal Server Error: Redirect Recursion.
    encoded = keyword.encode('utf8', 'ignore')
    try:
        html = self._get_search_content('kw', encoded, results)
    except IMDbDataAccessError:
        self._http_logger.warn('unable to search for keyword %s', encoded,
                               exc_info=True)
        return []
    parser = self.skProxy.search_keyword_parser
    return parser.parse(html, results=results)['data']
|
||||
|
||||
def _get_keyword(self, keyword, results):
    """Retrieve the page of a keyword and return the parsed ``'data'``.

    Returns an empty list (after logging a warning) when the page cannot
    be retrieved.
    """
    encoded = keyword.encode('utf8', 'ignore')
    try:
        html = self._retrieve(imdbURL_keyword_main % encoded)
    except IMDbDataAccessError:
        self._http_logger.warn('unable to get keyword %s', encoded,
                               exc_info=True)
        return []
    parser = self.skProxy.search_moviekeyword_parser
    return parser.parse(html, results=results)['data']
|
||||
|
||||
def _get_top_bottom_movies(self, kind):
    """Return the parsed top-250 or bottom-100 chart.

    `kind` must be ``'top'`` or ``'bottom'``; any other value yields an
    empty list without any network access.
    """
    if kind == 'top':
        parser, url = self.topBottomProxy.top250_parser, imdbURL_top250
    elif kind == 'bottom':
        parser, url = self.topBottomProxy.bottom100_parser, imdbURL_bottom100
    else:
        return []
    return parser.parse(self._retrieve(url))['data']
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,394 +0,0 @@
|
||||
"""
|
||||
parser.http.bsoupxpath module (imdb.parser.http package).
|
||||
|
||||
This module provides XPath support for BeautifulSoup.
|
||||
|
||||
Copyright 2008 H. Turgut Uyar <uyar@tekir.org>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
__author__ = 'H. Turgut Uyar <uyar@tekir.org>'
|
||||
__docformat__ = 'restructuredtext'
|
||||
|
||||
|
||||
import re
|
||||
import string
|
||||
import _bsoup as BeautifulSoup
|
||||
|
||||
|
||||
# XPath related enumerations and constants
|
||||
|
||||
# Names of the XPath axes handled by this module.
AXIS_ANCESTOR = 'ancestor'
AXIS_ATTRIBUTE = 'attribute'
AXIS_CHILD = 'child'
AXIS_DESCENDANT = 'descendant'
AXIS_FOLLOWING = 'following'
AXIS_FOLLOWING_SIBLING = 'following-sibling'
AXIS_PRECEDING_SIBLING = 'preceding-sibling'

# All axis names, in the order they are alternated in the axis regex.
AXES = (
    AXIS_ANCESTOR,
    AXIS_ATTRIBUTE,
    AXIS_CHILD,
    AXIS_DESCENDANT,
    AXIS_FOLLOWING,
    AXIS_FOLLOWING_SIBLING,
    AXIS_PRECEDING_SIBLING,
)

# XPath functions recognized inside predicates.
XPATH_FUNCTIONS = (
    'starts-with',
    'string-length',
)
|
||||
|
||||
|
||||
def tokenize_path(path):
    """Tokenize a location path into location steps. Return the list of steps.

    If two steps are separated by a double slash, the double slashes are part of
    the second step. If they are separated by only one slash, the slash is not
    included in any of the steps. Slashes inside single-quoted strings are
    never treated as separators.

    A trailing single slash does not produce an empty final step (the
    previous implementation raised IndexError on such paths). An empty path
    still yields ``['']``.
    """
    # form a list of tuples that mark the start and end positions of steps
    separators = []
    last_position = 0
    in_string = False
    length = len(path)
    i = -1
    while i < length - 1:
        i += 1
        char = path[i]
        if char == "'":
            in_string = not in_string
        if in_string:
            # slashes within strings are not step separators
            continue
        if char == '/':
            if i > 0:
                separators.append((last_position, i))
            # bounds check: the slash may be the last character of the path
            if i + 1 < length and path[i + 1] == '/':
                last_position = i
                i += 1
            else:
                last_position = i + 1
    # Skip the final segment only when a trailing slash left it empty;
    # a path with no separator at all always yields exactly one step.
    if last_position < length or not separators:
        separators.append((last_position, length))

    return [path[start:end] for start, end in separators]
|
||||
|
||||
|
||||
class Path:
    """A location path.

    `path` is the original expression and `steps` the list of parsed
    location steps, applied in order by `apply`.
    """

    def __init__(self, path, parse=True):
        """Store `path` and, when `parse` is true, parse it into steps.

        With ``parse=False`` the steps list is left empty so the caller can
        fill it in.
        """
        self.path = path
        self.steps = []
        if parse:
            # startswith() instead of indexing: '' and '/' must not raise
            if path.startswith('/') and not path.startswith('//'):
                # if not on the descendant axis, remove the leading slash
                path = path[1:]
            for step in tokenize_path(path):
                # an empty step only arises from an empty (or '/') path
                if step:
                    self.steps.append(PathStep(step))

    def apply(self, node):
        """Apply the path to a node. Return the resulting list of nodes.

        Apply the steps in the path sequentially by sending the output of each
        step as input to the next step.
        """
        # FIXME: this should return a node SET, not a node LIST
        # or at least a list with no duplicates
        if self.path.startswith('/'):
            # for an absolute path, start from the root
            if not isinstance(node, BeautifulSoup.Tag) \
               or (node.name != '[document]'):
                node = node.findParent('[document]')
        nodes = [node]
        for step in self.steps:
            nodes = step.apply(nodes)
        return nodes
|
||||
|
||||
|
||||
class PathStep:
    """A location step in a location path.

    A step is parsed once, in the constructor, into an axis, a node test,
    keyword arguments for the BeautifulSoup search methods and a list of
    predicate checkers; `apply` then runs it against a list of nodes.
    """

    # Explicit axis: either ``<axis-name>::`` or the ``@`` shorthand.
    AXIS_PATTERN = r"""(%s)::|@""" % '|'.join(AXES)
    # Node test: a word, optionally followed by ``()`` (e.g. ``text()``).
    NODE_TEST_PATTERN = r"""\w+(\(\))?"""
    # One bracketed predicate; its content is matched non-greedily.
    PREDICATE_PATTERN = r"""\[(.*?)\]"""
    # Groups used below: 1 = whole axis spec, 2 = axis name, 3 = node test,
    # 5 = all predicates as one string.
    LOCATION_STEP_PATTERN = r"""(%s)?(%s)((%s)*)""" \
        % (AXIS_PATTERN, NODE_TEST_PATTERN, PREDICATE_PATTERN)

    _re_location_step = re.compile(LOCATION_STEP_PATTERN)

    # ``not(...)`` wrapper; the content is matched up to the first ``)``.
    PREDICATE_NOT_PATTERN = r"""not\((.*?)\)"""
    # Groups used below: 1 = whole axis spec, 3 = node test,
    # 5 = ``='value'`` part, 6 = the quoted value.
    PREDICATE_AXIS_PATTERN = r"""(%s)?(%s)(='(.*?)')?""" \
        % (AXIS_PATTERN, NODE_TEST_PATTERN)
    # Groups used below: 1 = function name, 2 = argument list,
    # 4 = ``=value`` part, 5 = the value.
    PREDICATE_FUNCTION_PATTERN = r"""(%s)\(([^,]+(,\s*[^,]+)*)?\)(=(.*))?""" \
        % '|'.join(XPATH_FUNCTIONS)

    _re_predicate_not = re.compile(PREDICATE_NOT_PATTERN)
    _re_predicate_axis = re.compile(PREDICATE_AXIS_PATTERN)
    _re_predicate_function = re.compile(PREDICATE_FUNCTION_PATTERN)

    def __init__(self, step):
        """Parse the textual location step `step`.

        The steps ``.`` and ``..`` are stored as-is and not parsed further.
        A leading ``//`` selects the descendant axis as default; otherwise
        the default axis is child.
        """
        self.step = step
        if (step == '.') or (step == '..'):
            return

        if step[:2] == '//':
            default_axis = AXIS_DESCENDANT
            step = step[2:]
        else:
            default_axis = AXIS_CHILD

        # NOTE(review): a step that does not match the pattern leaves
        # step_match as None and fails on the next attribute access.
        step_match = self._re_location_step.match(step)

        # determine the axis
        axis = step_match.group(1)
        if axis is None:
            self.axis = default_axis
        elif axis == '@':
            self.axis = AXIS_ATTRIBUTE
        else:
            self.axis = step_match.group(2)

        # keyword arguments passed to the BeautifulSoup find* methods
        self.soup_args = {}
        # position to pick from each search result, or None for "all"
        self.index = None

        self.node_test = step_match.group(3)
        if self.node_test == 'text()':
            self.soup_args['text'] = True
        else:
            self.soup_args['name'] = self.node_test

        self.checkers = []
        predicates = step_match.group(5)
        if predicates is not None:
            # strip the outer brackets, then split "a][b" into ["a", "b"]
            predicates = [p for p in predicates[1:-1].split('][') if p]
            for predicate in predicates:
                checker = self.__parse_predicate(predicate)
                if checker is not None:
                    self.checkers.append(checker)

    def __parse_predicate(self, predicate):
        """Parse the predicate. Return a callable that can be used to filter
        nodes. Update `self.soup_args` to take advantage of BeautifulSoup search
        features.

        Returns None when the predicate was fully folded into
        `self.soup_args` / `self.index`. Raises NotImplementedError for
        predicates that match none of the supported forms.
        """
        try:
            position = int(predicate)
            if self.axis == AXIS_DESCENDANT:
                return PredicateFilter('position', value=position)
            else:
                # use the search limit feature instead of a checker
                self.soup_args['limit'] = position
                self.index = position - 1
                return None
        except ValueError:
            # not a numeric position predicate; try the other forms
            pass

        if predicate == "last()":
            self.index = -1
            return None

        negate = self._re_predicate_not.match(predicate)
        if negate:
            # continue parsing with the content of not(...)
            predicate = negate.group(1)

        function_match = self._re_predicate_function.match(predicate)
        if function_match:
            name = function_match.group(1)
            arguments = function_match.group(2)
            value = function_match.group(4)
            if value is not None:
                value = function_match.group(5)
            # NOTE(review): `negate` is not forwarded here — confirm whether
            # negated function predicates are meant to be supported.
            return PredicateFilter(name, arguments, value)

        axis_match = self._re_predicate_axis.match(predicate)
        if axis_match:
            axis = axis_match.group(1)
            if axis is None:
                axis = AXIS_CHILD
            elif axis == '@':
                axis = AXIS_ATTRIBUTE
            if axis == AXIS_ATTRIBUTE:
                # use the attribute search feature instead of a checker
                attribute_name = axis_match.group(3)
                if axis_match.group(5) is not None:
                    attribute_value = axis_match.group(6)
                elif not negate:
                    # bare [@attr]: search value True
                    attribute_value = True
                else:
                    # not(@attr): search value None
                    attribute_value = None
                if not self.soup_args.has_key('attrs'):
                    self.soup_args['attrs'] = {}
                self.soup_args['attrs'][attribute_name] = attribute_value
                return None
            elif axis == AXIS_CHILD:
                node_test = axis_match.group(3)
                node_value = axis_match.group(6)
                return PredicateFilter('axis', node_test, value=node_value,
                                       negate=negate)

        raise NotImplementedError("This predicate is not implemented")

    def apply(self, nodes):
        """Apply the step to a list of nodes. Return the list of nodes for the
        next step.
        """
        if self.step == '.':
            return nodes
        elif self.step == '..':
            return [node.parent for node in nodes]

        result = []
        for node in nodes:
            if self.axis == AXIS_CHILD:
                found = node.findAll(recursive=False, **self.soup_args)
            elif self.axis == AXIS_DESCENDANT:
                found = node.findAll(recursive=True, **self.soup_args)
            elif self.axis == AXIS_ATTRIBUTE:
                try:
                    found = [node[self.node_test]]
                except KeyError:
                    found = []
            elif self.axis == AXIS_FOLLOWING_SIBLING:
                found = node.findNextSiblings(**self.soup_args)
            elif self.axis == AXIS_PRECEDING_SIBLING:
                # TODO: make sure that the result is reverse ordered
                found = node.findPreviousSiblings(**self.soup_args)
            elif self.axis == AXIS_FOLLOWING:
                # find the last descendant of this node
                last = node
                while (not isinstance(last, BeautifulSoup.NavigableString)) \
                        and (len(last.contents) > 0):
                    last = last.contents[-1]
                found = last.findAllNext(**self.soup_args)
            elif self.axis == AXIS_ANCESTOR:
                found = node.findParents(**self.soup_args)

            # this should only be active if there is a position predicate
            # and the axis is not 'descendant'
            if self.index is not None:
                if found:
                    if len(found) > self.index:
                        found = [found[self.index]]
                    else:
                        found = []

            if found:
                # each checker narrows the result of the previous one
                for checker in self.checkers:
                    found = filter(checker, found)
                result.extend(found)

        return result
|
||||
|
||||
|
||||
class PredicateFilter:
    """A callable class for filtering nodes.

    `name` selects the kind of check: 'position', 'axis', 'starts-with' or
    'string-length'; any other name raises NotImplementedError. Calling the
    instance with a node returns the check result, inverted when `negate`
    is true.
    """

    def __init__(self, name, arguments=None, value=None, negate=False):
        """Set up the filter.

        For 'axis', `arguments` is the node test. For the two functions it
        is the raw, comma-separated argument string from the predicate.
        `value` is the right-hand side of the comparison, when there is one;
        'string-length' converts it with ``int()``.
        """
        self.name = name
        self.arguments = arguments
        self.negate = negate

        if name == 'position':
            self.__filter = self.__position
            self.value = value
        elif name == 'axis':
            self.__filter = self.__axis
            self.node_test = arguments
            self.value = value
        elif name == 'starts-with':
            self.__filter = self.__starts_with
            # a list comprehension (not map()) so the result can be indexed
            # regardless of the Python version
            args = [arg.strip() for arg in arguments.split(',')]
            # (is_attribute, attribute name or node test, unquoted prefix)
            if args[0][0] == '@':
                self.arguments = (True, args[0][1:], args[1][1:-1])
            else:
                self.arguments = (False, args[0], args[1][1:-1])
        elif name == 'string-length':
            self.__filter = self.__string_length
            args = [arg.strip() for arg in arguments.split(',')]
            # (is_attribute, attribute name or node test)
            if args[0][0] == '@':
                self.arguments = (True, args[0][1:])
            else:
                self.arguments = (False, args[0])
            self.value = int(value)
        else:
            raise NotImplementedError("This XPath function is not implemented")

    def __call__(self, node):
        """Return whether `node` passes the filter."""
        if self.negate:
            return not self.__filter(node)
        else:
            return self.__filter(node)

    def __position(self, node):
        """Check the 1-based position of `node` among its like siblings."""
        if isinstance(node, BeautifulSoup.NavigableString):
            actual_position = len(node.findPreviousSiblings(text=True)) + 1
        else:
            actual_position = len(node.findPreviousSiblings(node.name)) + 1
        return actual_position == self.value

    def __axis(self, node):
        """Check the node's text, or the presence/text of a direct child."""
        if self.node_test == 'text()':
            return node.string == self.value
        else:
            children = node.findAll(self.node_test, recursive=False)
            if len(children) > 0 and self.value is None:
                return True
            for child in children:
                if child.string == self.value:
                    return True
            return False

    def __starts_with(self, node):
        """Check that an attribute or the leading text starts with a prefix."""
        if self.arguments[0]:
            # this is an attribute
            attribute_name = self.arguments[1]
            if node.has_key(attribute_name):
                first = node[attribute_name]
                return first.startswith(self.arguments[2])
        elif self.arguments[1] == 'text()':
            # a node without contents has no leading text to test
            # (indexing contents[0] unconditionally raised IndexError)
            if node.contents:
                first = node.contents[0]
                if isinstance(first, BeautifulSoup.NavigableString):
                    return first.startswith(self.arguments[2])
        return False

    def __string_length(self, node):
        """Check the length of an attribute value or of the node's string."""
        # stays None when the argument is neither @attr nor text(); the old
        # code left the name unbound in that case
        value = None
        if self.arguments[0]:
            # this is an attribute
            attribute_name = self.arguments[1]
            if node.has_key(attribute_name):
                value = node[attribute_name]
        elif self.arguments[1] == 'text()':
            value = node.string
        if value is not None:
            return len(value) == self.value
        return False
|
||||
|
||||
|
||||
# caches: path expression -> Path, step text -> PathStep
_paths = {}
_steps = {}


def get_path(path):
    """Utility for eliminating repeated parsings of the same paths and steps.

    Returns the cached Path for `path`, building it on first use from
    cached PathStep objects (one per distinct step text).
    """
    try:
        return _paths[path]
    except KeyError:
        pass
    compiled = Path(path, parse=False)
    for step in tokenize_path(path):
        if step not in _steps:
            _steps[step] = PathStep(step)
        compiled.steps.append(_steps[step])
    _paths[path] = compiled
    return compiled
|
||||
@@ -1,75 +0,0 @@
|
||||
"""
|
||||
parser.http.bsouplxml.etree module (imdb.parser.http package).
|
||||
|
||||
This module adapts the beautifulsoup interface to lxml.etree module.
|
||||
|
||||
Copyright 2008 H. Turgut Uyar <uyar@tekir.org>
|
||||
2008 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import _bsoup as BeautifulSoup
|
||||
from _bsoup import Tag as Element
|
||||
|
||||
import bsoupxpath
|
||||
|
||||
# Not directly used by IMDbPY, but do not remove: it's used by IMDbPYKit,
|
||||
# for example.
|
||||
def fromstring(xml_string):
    """Return a DOM representation of the string."""
    # We try to not use BeautifulSoup.BeautifulStoneSoup.XML_ENTITIES,
    # for convertEntities.
    soup = BeautifulSoup.BeautifulStoneSoup(xml_string, convertEntities=None)
    # the result of findChild(True) on the parsed document is returned
    return soup.findChild(True)
|
||||
|
||||
|
||||
def tostring(element, encoding=None, pretty_print=False):
    """Return a string or unicode representation of an element.

    Passing the ``unicode`` type as `encoding` is treated the same as
    passing no encoding.
    """
    target_encoding = None if encoding is unicode else encoding
    # For BeautifulSoup 3.1
    #encArgs = {'prettyPrint': pretty_print}
    #if target_encoding is not None:
    #    encArgs['encoding'] = target_encoding
    #return element.encode(**encArgs)
    return element.__str__(target_encoding, pretty_print)
|
||||
|
||||
def setattribute(tag, name, value):
    """Set attribute `name` of `tag` to `value` through item assignment."""
    tag[name] = value
|
||||
|
||||
def xpath(node, expr):
    """Apply an xpath expression to a node. Return a list of nodes."""
    # obtain the path through bsoupxpath.get_path rather than constructing
    # bsoupxpath.Path(expr) directly
    compiled = bsoupxpath.get_path(expr)
    return compiled.apply(node)
|
||||
|
||||
|
||||
# XXX: monkey patching the beautifulsoup tag class
|
||||
class _EverythingIsNestable(dict):
|
||||
""""Fake that every tag is nestable."""
|
||||
def get(self, key, *args, **kwds):
|
||||
return []
|
||||
|
||||
# every lookup in the nestable-tags table now answers with an empty list
BeautifulSoup.BeautifulStoneSoup.NESTABLE_TAGS = _EverythingIsNestable()

# read-only aliases on Tag: tag -> name, attrib -> the tag itself,
# text -> string
BeautifulSoup.Tag.tag = property(lambda tag: tag.name)
BeautifulSoup.Tag.attrib = property(lambda tag: tag)
BeautifulSoup.Tag.text = property(lambda tag: tag.string)

# method aliases on Tag
BeautifulSoup.Tag.set = setattribute
BeautifulSoup.Tag.getparent = lambda tag: tag.parent
BeautifulSoup.Tag.drop_tree = BeautifulSoup.Tag.extract
BeautifulSoup.Tag.xpath = xpath
|
||||
|
||||
# TODO: setting the text attribute for tags
|
||||
@@ -1,31 +0,0 @@
|
||||
"""
|
||||
parser.http.bsouplxml.html module (imdb.parser.http package).
|
||||
|
||||
This module adapts the beautifulsoup interface to lxml.html module.
|
||||
|
||||
Copyright 2008 H. Turgut Uyar <uyar@tekir.org>
|
||||
2008 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import _bsoup as BeautifulSoup
|
||||
|
||||
|
||||
def fromstring(html_string):
    """Return a DOM representation of the string."""
    entities = BeautifulSoup.BeautifulSoup.HTML_ENTITIES
    soup = BeautifulSoup.BeautifulSoup(html_string, convertEntities=entities)
    # the result of findChild(True) on the parsed document is returned
    return soup.findChild(True)
|
||||
@@ -1,203 +0,0 @@
|
||||
"""
|
||||
parser.http.characterParser module (imdb package).
|
||||
|
||||
This module provides the classes (and the instances), used to parse
|
||||
the IMDb pages on the akas.imdb.com server about a character.
|
||||
E.g., for "Jesse James" the referred pages would be:
|
||||
main details: http://www.imdb.com/character/ch0000001/
|
||||
biography: http://www.imdb.com/character/ch0000001/bio
|
||||
...and so on...
|
||||
|
||||
Copyright 2007-2009 Davide Alberani <da@erlug.linux.it>
|
||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import re
|
||||
from utils import Attribute, Extractor, DOMParserBase, build_movie, \
|
||||
analyze_imdbid
|
||||
from personParser import DOMHTMLMaindetailsParser
|
||||
|
||||
from imdb.Movie import Movie
|
||||
|
||||
# Captures the 7-digit numeric part of every '/name/nmXXXXXXX' link.
_personIDs = re.compile(r'/name/nm([0-9]{7})')

class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
    """Parser for the "filmography" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bparser = DOMHTMLCharacterMaindetailsParser()
        result = bparser.parse(character_maindetails_html_string)
    """
    _containsObjects = True

    # Shared by the flat and the sectioned filmography extractors below:
    # one movie is built per <li>, with the person IDs found in the item's
    # link passed as roleID.
    _film_attrs = [Attribute(key=None,
                      multi=True,
                      path={
                          'link': "./a[1]/@href",
                          'title': ".//text()",
                          'status': "./i/a//text()",
                          'roleID': "./a/@href"
                          },
                      postprocess=lambda x:
                          build_movie(x.get('title') or u'',
                              movieID=analyze_imdbid(x.get('link') or u''),
                              roleID=_personIDs.findall(x.get('roleID') or u''),
                              status=x.get('status') or None,
                              _parsingCharacter=True))]

    extractors = [
            # character name: page title without the fixed suffixes
            Extractor(label='title',
                        path="//title",
                        attrs=Attribute(key='name',
                            path="./text()",
                            postprocess=lambda x: \
                                x.replace(' (Character)', '').replace(
                                    '- Filmography by type', '').strip())),

            Extractor(label='headshot',
                        path="//a[@name='headshot']",
                        attrs=Attribute(key='headshot',
                            path="./img/@src")),

            # alternate names are split on ' / '
            Extractor(label='akas',
                        path="//div[h5='Alternate Names:']",
                        attrs=Attribute(key='akas',
                            path="./div//text()",
                            postprocess=lambda x: x.strip().split(' / '))),

            # filmography blocks without an <h5> header
            Extractor(label='filmography',
                        path="//div[@class='filmo'][not(h5)]/ol/li",
                        attrs=_film_attrs),

            # filmography blocks with an <h5> header: grouped by the
            # lower-cased header text minus its last character
            Extractor(label='filmography sections',
                        group="//div[@class='filmo'][h5]",
                        group_key="./h5/a/text()",
                        group_key_normalize=lambda x: x.lower()[:-1],
                        path="./ol/li",
                        attrs=_film_attrs),
            ]

    preprocessors = [
            # Check that this doesn't cut "status"...
            (re.compile(r'<br>(\.\.\.| ).+?</li>', re.I | re.M), '</li>')]
|
||||
|
||||
|
||||
class DOMHTMLCharacterBioParser(DOMParserBase):
    """Parser for the "biography" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bparser = DOMHTMLCharacterBioParser()
        result = bparser.parse(character_biography_html_string)
    """
    _defGetRefs = True

    # Both extractors rely on markers ('_intro' div, '_biography' spans)
    # that are injected into the page by the preprocessors below.
    extractors = [
            Extractor(label='introduction',
                        path="//div[@id='_intro']",
                        attrs=Attribute(key='introduction',
                            path=".//text()",
                            postprocess=lambda x: x.strip())),

            # each section becomes "<preceding h4 text>: <section text>";
            # '||' markers left by the preprocessors become blank lines
            Extractor(label='biography',
                        path="//span[@class='_biography']",
                        attrs=Attribute(key='biography',
                            multi=True,
                            path={
                                'info': "./preceding-sibling::h4[1]//text()",
                                'text': ".//text()"
                            },
                            postprocess=lambda x: u'%s: %s' % (
                                x.get('info').strip(),
                                x.get('text').replace('\n',
                                    ' ').replace('||', '\n\n').strip()))),
    ]

    # (pattern, replacement) pairs, listed in the order they are defined
    preprocessors = [
        # open the '_intro' div right after the swiki container
        (re.compile('(<div id="swiki.2.3.1">)', re.I), r'\1<div id="_intro">'),
        # close the intro div before the history table and move the anchor
        (re.compile('(<a name="history">)\s*(<table .*?</table>)',
                    re.I | re.DOTALL),
         r'</div>\2\1</a>'),
        # close the previous section span before each named <h4> header
        (re.compile('(<a name="[^"]+">)(<h4>)', re.I), r'</span>\1</a>\2'),
        # open a '_biography' span after each header
        (re.compile('(</h4>)</a>', re.I), r'\1<span class="_biography">'),
        # mark paragraph breaks
        (re.compile('<br/><br/>', re.I), r'||'),
        # a paragraph break followed by a newline closes the section span
        (re.compile('\|\|\n', re.I), r'</span>'),
    ]
|
||||
|
||||
|
||||
class DOMHTMLCharacterQuotesParser(DOMParserBase):
    """Parser for the "quotes" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        qparser = DOMHTMLCharacterQuotesParser()
        result = qparser.parse(character_quotes_html_string)
    """
    _defGetRefs = True

    # One group per <h5> header; each value is a tuple of
    # (movieID, list of quote blocks split on '||').
    # NOTE(review): as written, both .replace(': ', ': ') calls are no-ops;
    # the search strings may have lost repeated spaces — verify upstream.
    extractors = [
        Extractor(label='charquotes',
                    group="//h5",
                    group_key="./a/text()",
                    path="./following-sibling::div[1]",
                    attrs=Attribute(key=None,
                        path={'txt': ".//text()",
                              'movieID': ".//a[1]/@href"},
                        postprocess=lambda x: (analyze_imdbid(x['movieID']),
                                    x['txt'].strip().replace(': ',
                                    ': ').replace(': ', ': ').split('||'))))
    ]

    # (pattern, replacement) pairs, listed in the order they are defined
    preprocessors = [
        # open a <div> after every header
        (re.compile('(</h5>)', re.I), r'\1<div>'),
        # double line breaks separate quote blocks
        (re.compile('\s*<br/><br/>\s*', re.I), r'||'),
        # a block separator right before <hr/> closes the div
        (re.compile('\|\|\s*(<hr/>)', re.I), r'</div>\1'),
        # single line breaks separate the lines of one quote
        (re.compile('\s*<br/>\s*', re.I), r'::')
    ]

    def postprocess_data(self, data):
        """Turn the extracted groups into ``{'quotes': {movie: quotes}}``.

        Each key is a Movie built from the group title and movieID, or the
        bare title when no movieID was found; each quote block is split
        into its lines on '::'. Returns an empty dict for empty `data`.
        """
        if not data:
            return {}
        newData = {}
        for title in data:
            movieID, quotes = data[title]
            if movieID is None:
                movie = title
            else:
                movie = Movie(title=title, movieID=movieID,
                              accessSystem=self._as, modFunct=self._modFunct)
            newData[movie] = [quote.split('::') for quote in quotes]
        return {'quotes': newData}
|
||||
|
||||
|
||||
from personParser import DOMHTMLSeriesParser
|
||||
|
||||
# Parser name -> ((parser class,), keyword arguments or None).
# NOTE(review): presumably consumed by the package's parser-proxy
# machinery — confirm against the caller.
_OBJECTS = {
    'character_main_parser': (
        (DOMHTMLCharacterMaindetailsParser,),
        {'kind': 'character'},
    ),
    'character_series_parser': ((DOMHTMLSeriesParser,), None),
    'character_bio_parser': ((DOMHTMLCharacterBioParser,), None),
    'character_quotes_parser': ((DOMHTMLCharacterQuotesParser,), None),
}
|
||||
|
||||
|
||||
@@ -1,91 +0,0 @@
|
||||
"""
|
||||
parser.http.companyParser module (imdb package).
|
||||
|
||||
This module provides the classes (and the instances), used to parse
|
||||
the IMDb pages on the akas.imdb.com server about a company.
|
||||
E.g., for "Columbia Pictures [us]" the referred page would be:
|
||||
main details: http://akas.imdb.com/company/co0071509/
|
||||
|
||||
Copyright 2008-2009 Davide Alberani <da@erlug.linux.it>
|
||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import re
|
||||
from utils import build_movie, Attribute, Extractor, DOMParserBase, \
|
||||
analyze_imdbid
|
||||
|
||||
from imdb.utils import analyze_company_name
|
||||
|
||||
|
||||
class DOMCompanyParser(DOMParserBase):
    """Parser for the main page of a given company.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMCompanyParser()
        result = cparser.parse(company_html_string)
    """
    _containsObjects = True

    extractors = [
            Extractor(label='name',
                        path="//title",
                        attrs=Attribute(key='name',
                            path="./text()",
                            postprocess=lambda x: \
                                analyze_company_name(x, stripNotes=True))),

            # one group per named anchor; the movies are the items of the
            # first <ol> following the anchor's parent
            Extractor(label='filmography',
                        group="//b/a[@name]",
                        group_key="./text()",
                        group_key_normalize=lambda x: x.lower(),
                        path="../following-sibling::ol[1]/li",
                        attrs=Attribute(key=None,
                            multi=True,
                            path={
                                'link': "./a[1]/@href",
                                'title': "./a[1]/text()",
                                'year': "./text()[1]"
                                },
                            # title and year get the same "or u''" guard as
                            # the link, so a missing value neither raises
                            # nor ends up as the text 'None' in the title
                            postprocess=lambda x:
                                build_movie(u'%s %s' % \
                                    (x.get('title') or u'',
                                     (x.get('year') or u'').strip()),
                                    movieID=analyze_imdbid(x.get('link') or u''),
                                    _parsingCompany=True))),
            ]

    preprocessors = [
        (re.compile('(<b><a name=)', re.I), r'</p>\1')
        ]

    def postprocess_data(self, data):
        """Rename section keys to their plural / canonical form, in place.

        'company' becomes 'companies', 'other' becomes 'miscellaneous' and
        'distributor' becomes 'distributors'. Returns the same dictionary.
        """
        # iterate over a snapshot: the dictionary is modified in the loop
        for key in list(data.keys()):
            new_key = key.replace('company', 'companies')
            new_key = new_key.replace('other', 'miscellaneous')
            new_key = new_key.replace('distributor', 'distributors')
            if new_key != key:
                data[new_key] = data.pop(key)
        return data
|
||||
|
||||
|
||||
# Parser name -> ((parser class,), keyword arguments or None).
# NOTE(review): presumably consumed by the package's parser-proxy
# machinery — confirm against the caller.
_OBJECTS = {
    'company_main_parser': ((DOMCompanyParser,), None),
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,559 +0,0 @@
|
||||
"""
|
||||
parser.http.personParser module (imdb package).
|
||||
|
||||
This module provides the classes (and the instances), used to parse
|
||||
the IMDb pages on the akas.imdb.com server about a person.
|
||||
E.g., for "Mel Gibson" the referred pages would be:
|
||||
categorized: http://akas.imdb.com/name/nm0000154/maindetails
|
||||
biography: http://akas.imdb.com/name/nm0000154/bio
|
||||
...and so on...
|
||||
|
||||
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
|
||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import re
|
||||
from imdb.Movie import Movie
|
||||
from imdb.utils import analyze_name, canonicalName, normalizeName, \
|
||||
analyze_title, date_and_notes
|
||||
from utils import build_movie, DOMParserBase, Attribute, Extractor, \
|
||||
analyze_imdbid
|
||||
|
||||
|
||||
from movieParser import _manageRoles
|
||||
# Matches a list entry in three groups: the text from '<li>' up to and
# including the ' .... ' separator, the text after it, and the closing
# '</li>' or '<br>' tag.
_reRoles = re.compile(
    r'(<li>.*? \.\.\.\. )(.*?)(</li>|<br>)',
    re.I | re.M | re.S,
)
|
||||
|
||||
def build_date(date):
    """Join the 'day' and 'year' entries of *date* into one value.

    Returns "<day> <year>" when both entries are truthy, the single
    truthy entry (unchanged) when only one is present, and an empty
    string when neither is.
    """
    present = [part for part in (date.get('day'), date.get('year')) if part]
    if len(present) == 2:
        return "%s %s" % tuple(present)
    if present:
        return present[0]
    return ""
|
||||
|
||||
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.

    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True

    # Birth date (day and year links joined by build_date) and birth place.
    _birth_attrs = [
        Attribute(key='birth date',
                  path={
                      'day': ".//a[starts-with(@href, "
                             "'/date/')]/text()",
                      'year': ".//a[starts-with(@href, "
                              "'/search/name?birth_year=')]/text()"
                  },
                  postprocess=build_date),
        Attribute(key='birth place',
                  path=".//a[starts-with(@href, "
                       "'/search/name?birth_place=')]/text()")]

    # Same layout as the birth attributes, for the death section.
    _death_attrs = [
        Attribute(key='death date',
                  path={
                      'day': ".//a[starts-with(@href, "
                             "'/date/')]/text()",
                      'year': ".//a[starts-with(@href, "
                              "'/search/name?death_year=')]/text()"
                  },
                  postprocess=build_date),
        Attribute(key='death place',
                  path=".//a[starts-with(@href, "
                       "'/search/name?death_place=')]/text()")]

    # One entry per filmography row; each row is turned into an object
    # by build_movie.
    _film_attrs = [
        Attribute(key=None,
                  multi=True,
                  path={
                      'link': "./b/a[1]/@href",
                      'title': "./b/a[1]/text()",
                      'notes': "./b/following-sibling::text()",
                      'year': "./span[@class='year_column']/text()",
                      'status': "./a[@class='in_production']/text()",
                      'rolesNoChar': './/br/following-sibling::text()',
                      'chrRoles': "./a[@imdbpyname]/@imdbpyname",
                      'roleID': "./a[starts-with(@href, '/character/')]/@href"
                  },
                  postprocess=lambda x:
                      build_movie(x.get('title') or u'',
                          year=x.get('year'),
                          movieID=analyze_imdbid(x.get('link') or u''),
                          rolesNoChar=(x.get('rolesNoChar') or u'').strip(),
                          chrRoles=(x.get('chrRoles') or u'').strip(),
                          additionalNotes=x.get('notes'),
                          roleID=(x.get('roleID') or u''),
                          status=x.get('status') or None))]

    extractors = [
        Extractor(label='name',
                  path="//h1[@class='header']",
                  attrs=Attribute(key='name',
                                  path=".//text()",
                                  postprocess=lambda x: analyze_name(x,
                                                            canonical=1))),

        Extractor(label='birth info',
                  path="//div[h4='Born:']",
                  attrs=_birth_attrs),

        Extractor(label='death info',
                  path="//div[h4='Died:']",
                  attrs=_death_attrs),

        Extractor(label='headshot',
                  path="//td[@id='img_primary']/a",
                  attrs=Attribute(key='headshot',
                                  path="./img/@src")),

        # NOTE(review): the separator below reads as a single space, which
        # would split multi-word names; it may have been a wider separator
        # before this text was rendered -- confirm against upstream.
        Extractor(label='akas',
                  path="//div[h4='Alternate Names:']",
                  attrs=Attribute(key='akas',
                                  path="./text()",
                                  postprocess=lambda x: x.strip().split(' '))),

        Extractor(label='filmography',
                  group="//div[starts-with(@id, 'filmo-head-')]",
                  group_key="./a[@name]/text()",
                  group_key_normalize=lambda x: x.lower().replace(': ', ' '),
                  path="./following-sibling::div[1]"
                       "/div[starts-with(@class, 'filmo-row')]",
                  attrs=_film_attrs),

        # Only 'link' and 'title' are extracted here, so the 'roleID' and
        # 'status' lookups in the lambda always take their fallback value.
        Extractor(label='indevelopment',
                  path="//div[starts-with(@class,'devitem')]",
                  attrs=Attribute(key='in development',
                                  multi=True,
                                  path={
                                      'link': './a/@href',
                                      'title': './a/text()'
                                  },
                                  postprocess=lambda x:
                                      build_movie(x.get('title') or u'',
                                          movieID=analyze_imdbid(x.get('link') or u''),
                                          roleID=(x.get('roleID') or u'').split('/'),
                                          status=x.get('status') or None)))
    ]

    # The last rule copies the text of every character link into an
    # 'imdbpyname' attribute (suffixed with '@@'), which _film_attrs reads
    # back through the 'chrRoles' path.
    preprocessors = [('<div class="clear"/> </div>', ''),
                     ('<br/>', '<br />'),
                     (re.compile(r'(<a href="/character/ch[0-9]{7}")>(.*?)</a>'),
                      r'\1 imdbpyname="\2@@">\2</a>')]

    def postprocess_data(self, data):
        """Clean up the parsed dictionary; modifies and returns *data*.

        - empty 'birth date' / 'death date' entries are removed;
        - every key starting with 'actor ', 'actress ' or 'self ' is merged
          into the plain 'actor', 'actress' or 'self' list;
        - 'birth place' / 'death place' are renamed to 'birth notes' /
          'death notes'.
        """
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        # XXX: the code below is for backwards compatibility
        # probably could be removed
        # Iterate over a snapshot of the keys, since entries are added and
        # removed inside the loop.
        for key in list(data.keys()):
            for kind in ('actor', 'actress', 'self'):
                if key.startswith(kind + ' '):
                    data.setdefault(kind, []).extend(data.pop(key))
                    break
            else:
                if key == 'birth place':
                    data['birth notes'] = data.pop(key)
                elif key == 'death place':
                    data['death notes'] = data.pop(key)
        return data
|
||||
|
||||
|
||||
class DOMHTMLBioParser(DOMParserBase):
    """Parser for the "biography" page of a given person.

    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bioparser = DOMHTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    _defGetRefs = True

    # Birth date (day and year links joined by build_date) and birth notes.
    _birth_attrs = [
        Attribute(key='birth date',
                  path={
                      'day': "./a[starts-with(@href, "
                             "'/date/')]/text()",
                      'year': "./a[starts-with(@href, "
                              "'/search/name?birth_year=')]/text()"
                  },
                  postprocess=build_date),
        Attribute(key='birth notes',
                  path="./a[starts-with(@href, "
                       "'/search/name?birth_place=')]/text()")]

    _death_attrs = [
        Attribute(key='death date',
                  path={
                      'day': "./a[starts-with(@href, "
                             "'/date/')]/text()",
                      'year': "./a[starts-with(@href, "
                              "'/search/name?death_date=')]/text()"
                  },
                  postprocess=build_date),
        Attribute(key='death notes',
                  path="./text()",
                  # TODO: check if this slicing is always correct
                  postprocess=lambda x: u''.join(x).strip()[2:])]

    # Sections are located through their <h5> title; the preprocessors
    # below wrap every <h5> section in its own <div>.
    extractors = [
        Extractor(label='headshot',
                  path="//a[@name='headshot']",
                  attrs=Attribute(key='headshot',
                                  path="./img/@src")),
        Extractor(label='birth info',
                  path="//div[h5='Date of Birth']",
                  attrs=_birth_attrs),
        Extractor(label='death info',
                  path="//div[h5='Date of Death']",
                  attrs=_death_attrs),
        # Nick names are joined with '|' and split again; the first ' ('
        # of each one becomes '::('.
        Extractor(label='nick names',
                  path="//div[h5='Nickname']",
                  attrs=Attribute(key='nick names',
                                  path="./text()",
                                  joiner='|',
                                  postprocess=lambda x: [n.strip().replace(' (',
                                          '::(', 1) for n in x.split('|')
                                          if n.strip()])),
        Extractor(label='birth name',
                  path="//div[h5='Birth Name']",
                  attrs=Attribute(key='birth name',
                                  path="./text()",
                                  postprocess=lambda x: canonicalName(x.strip()))),
        Extractor(label='height',
                  path="//div[h5='Height']",
                  attrs=Attribute(key='height',
                                  path="./text()",
                                  postprocess=lambda x: x.strip())),
        # Each biography is stored as "text::author"; the author defaults
        # to 'Anonymous'.
        Extractor(label='mini biography',
                  path="//div[h5='Mini Biography']",
                  attrs=Attribute(key='mini biography',
                                  multi=True,
                                  path={
                                      'bio': "./p//text()",
                                      'by': "./b/following-sibling::a/text()"
                                  },
                                  postprocess=lambda x: "%s::%s" %
                                      (x.get('bio').strip(),
                                       (x.get('by') or u'').strip() or u'Anonymous'))),
        # Each spouse is stored as "name::info", without dangling colons.
        Extractor(label='spouse',
                  path="//div[h5='Spouse']/table/tr",
                  attrs=Attribute(key='spouse',
                                  multi=True,
                                  path={
                                      'name': "./td[1]//text()",
                                      'info': "./td[2]//text()"
                                  },
                                  postprocess=lambda x: ("%s::%s" %
                                      (x.get('name').strip(),
                                       (x.get('info') or u'').strip())).strip(':'))),
        Extractor(label='trade mark',
                  path="//div[h5='Trade Mark']/p",
                  attrs=Attribute(key='trade mark',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),
        Extractor(label='trivia',
                  path="//div[h5='Trivia']/p",
                  attrs=Attribute(key='trivia',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),
        Extractor(label='quotes',
                  path="//div[h5='Personal Quotes']/p",
                  attrs=Attribute(key='quotes',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),
        # Each salary row is stored as "title::info".
        Extractor(label='salary',
                  path="//div[h5='Salary']/table/tr",
                  attrs=Attribute(key='salary history',
                                  multi=True,
                                  path={
                                      'title': "./td[1]//text()",
                                      'info': "./td[2]/text()",
                                  },
                                  postprocess=lambda x: "%s::%s" %
                                      (x.get('title').strip(),
                                       x.get('info').strip()))),
        Extractor(label='where now',
                  path="//div[h5='Where Are They Now']/p",
                  attrs=Attribute(key='where now',
                                  multi=True,
                                  path=".//text()",
                                  postprocess=lambda x: x.strip())),
    ]

    # Patterns are raw strings so that regex escapes such as \s and \. are
    # not treated as (invalid) string escapes; matching is unchanged.
    preprocessors = [
        (re.compile(r'(<h5>)', re.I), r'</div><div class="_imdbpy">\1'),
        (re.compile(r'(</table>\n</div>\s+)</div>', re.I | re.DOTALL), r'\1'),
        (re.compile(r'(<div id="tn15bot">)'), r'</div>\1'),
        (re.compile(r'\.<br><br>([^\s])', re.I), r'. \1')
    ]

    def postprocess_data(self, data):
        """Drop empty 'birth date' / 'death date' entries and return *data*."""
        for what in ('birth date', 'death date'):
            if what in data and not data[what]:
                del data[what]
        return data
|
||||
|
||||
|
||||
class DOMHTMLOtherWorksParser(DOMParserBase):
    """Parser for the "other works" and "agent" pages of a given person.

    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        owparser = DOMHTMLOtherWorksParser()
        result = owparser.parse(otherworks_html_string)
    """
    _defGetRefs = True
    kind = 'other works'

    # XXX: looks like the 'agent' page is no more public.
    # The text of the block following the 'Other works' title is split
    # into entries on blank lines (produced by the last preprocessor).
    # NOTE(review): the key 'self.kind' is presumably resolved to the
    # value of the 'kind' attribute by the base parser -- confirm.
    extractors = [
        Extractor(label='other works',
                  path="//h5[text()='Other works']/"
                       "following-sibling::div[1]",
                  attrs=Attribute(key='self.kind',
                                  path=".//text()",
                                  postprocess=lambda x: x.strip().split('\n\n')))
    ]

    # Patterns are raw strings so that the \s regex escape is not treated
    # as an (invalid) string escape; matching is unchanged.
    preprocessors = [
        (re.compile(r'(<h5>[^<]+</h5>)', re.I),
         r'</div>\1<div class="_imdbpy">'),
        (re.compile(r'(</table>\n</div>\s+)</div>', re.I), r'\1'),
        (re.compile(r'(<div id="tn15bot">)'), r'</div>\1'),
        (re.compile(r'<br/><br/>', re.I), r'\n\n')
    ]
|
||||
|
||||
|
||||
def _build_episode(link, title, minfo, role, roleA, roleAID):
    """Build a Movie object for a given episode of a series.

    link    -- link of the episode; its imdbID is taken with analyze_imdbid.
    title   -- title of the episode, parsed with analyze_title.
    minfo   -- text found after the title; it may start with a
               parenthesised air date and may contain, after ' -', a
               role with optional trailing "(notes)".
    role    -- the role, or None.
    roleA   -- alternative role, used when role is None.
    roleAID -- link of the role, or None; ignored when no role is known.

    Returns the Movie instance, with 'kind' set to u'episode' and, when
    available, 'original air date' and 'year' filled in.
    """
    episode_id = analyze_imdbid(link)
    notes = u''
    minidx = minfo.find(' -')
    # Sometimes, for some unknown reason, the role is left in minfo.
    if minidx != -1:
        slfRole = minfo[minidx+3:].lstrip()
        minfo = minfo[:minidx].rstrip()
        if slfRole.endswith(')'):
            # A trailing "(...)" is moved from the role to the notes.
            commidx = slfRole.rfind('(')
            if commidx != -1:
                notes = slfRole[commidx:]
                slfRole = slfRole[:commidx]
        if slfRole and role is None and roleA is None:
            role = slfRole
    eps_data = analyze_title(title)
    eps_data['kind'] = u'episode'
    # FIXME: it's wrong for multiple characters (very rare on tv series?).
    if role is None:
        role = roleA # At worse, it's None.
    if role is None:
        roleAID = None
    if roleAID is not None:
        roleAID = analyze_imdbid(roleAID)
    e = Movie(movieID=episode_id, data=eps_data, currentRole=role,
                roleID=roleAID, notes=notes)
    # XXX: are we missing some notes?
    # XXX: does it parse things as "Episode dated 12 May 2005 (12 May 2005)"?
    if minfo.startswith('('):
        pe = minfo.find(')')
        if pe != -1:
            date = minfo[1:pe]
            if date != '????':
                e['original air date'] = date
                if eps_data.get('year', '????') == '????':
                    # An empty or blank date has no tokens: indexing the
                    # split result directly would raise IndexError.
                    tokens = date.split()
                    if tokens and tokens[-1].isdigit():
                        e['year'] = int(tokens[-1])
    return e
|
||||
|
||||
|
||||
class DOMHTMLSeriesParser(DOMParserBase):
    """Parser for the "by TV series" page of a given person.

    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        sparser = DOMHTMLSeriesParser()
        result = sparser.parse(filmoseries_html_string)
    """
    _containsObjects = True

    # Episodes are grouped by series; every episode link is turned into
    # a Movie by _build_episode.
    _episode_paths = {
        'link': "./@href",
        'title': "./text()",
        'info': "./following-sibling::text()",
        'role': "./following-sibling::i[1]/text()",
        'roleA': "./following-sibling::a[1]/text()",
        'roleAID': "./following-sibling::a[1]/@href"
    }

    extractors = [
        Extractor(label='series',
                  group="//div[@class='filmo']/span[1]",
                  group_key="./a[1]",
                  path="./following-sibling::ol[1]/li/a[1]",
                  attrs=Attribute(key=None,
                                  multi=True,
                                  path=_episode_paths,
                                  postprocess=lambda x: _build_episode(
                                      x.get('link'),
                                      x.get('title'),
                                      (x.get('info') or u'').strip(),
                                      x.get('role'),
                                      x.get('roleA'),
                                      x.get('roleAID'))))
    ]

    def postprocess_data(self, data):
        """Return {'episodes': {series: [episode, ...]}}, or {} if no data.

        Every key of *data* is parsed again to get the link and the title
        of the series (the title is taken without its first and last
        character); each episode gets the series as its 'episode of' value.
        """
        if not data:
            return {}
        by_series = {}
        for key, episodes in data.items():
            dom = self.get_dom(key)
            link = self.xpath(dom, "//a/@href")[0]
            title = self.xpath(dom, "//a/text()")[0][1:-1]
            series = Movie(movieID=analyze_imdbid(link),
                           data=analyze_title(title),
                           accessSystem=self._as, modFunct=self._modFunct)
            collected = by_series[series] = []
            for episode in episodes:
                # XXX: should we create a copy of 'series', to avoid
                # circular references?
                episode['episode of'] = series
                collected.append(episode)
        return {'episodes': by_series}
|
||||
|
||||
|
||||
class DOMHTMLPersonGenresParser(DOMParserBase):
    """Parser for the "by genre" and "by keywords" pages of a given person.

    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        gparser = DOMHTMLPersonGenresParser()
        result = gparser.parse(bygenre_html_string)
    """
    kind = 'genres'
    _containsObjects = True

    # One group per section (its lower-cased text is the key); every
    # title link below it becomes an object built by build_movie from the
    # link text plus the following text up to the first '['.
    _title_attr = Attribute(
        key=None,
        multi=True,
        path={
            'link': "./@href",
            'title': "./text()",
            'info': "./following-sibling::text()"
        },
        postprocess=lambda x:
            build_movie(x.get('title') +
                        x.get('info').split('[')[0],
                        analyze_imdbid(x.get('link'))))

    extractors = [
        Extractor(label='genres',
                  group="//b/a[@name]/following-sibling::a[1]",
                  group_key="./text()",
                  group_key_normalize=lambda x: x.lower(),
                  path="../../following-sibling::ol[1]/li//a[1]",
                  attrs=_title_attr)
    ]

    def postprocess_data(self, data):
        """Wrap the parsed data under the self.kind key ({} if empty)."""
        if not data:
            return {}
        return {self.kind: data}
|
||||
|
||||
|
||||
from movieParser import _parse_merchandising_link
|
||||
|
||||
class DOMHTMLPersonSalesParser(DOMParserBase):
    """Parser for the "merchandising links" page of a given person.

    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        sparser = DOMHTMLPersonSalesParser()
        result = sparser.parse(sales_html_string)
    """
    # Links are grouped by the text of each 'merch_title' span; every
    # row is handed to _parse_merchandising_link.
    _link_attr = Attribute(
        key=None,
        multi=True,
        path={
            'link': "./td[2]/a[1]/@href",
            'text': "./td[1]/img[1]/@alt",
            'cover': "./ancestor::td[1]/../"
                     "td[1]/a[1]/img[1]/@src",
        },
        postprocess=_parse_merchandising_link)

    extractors = [
        Extractor(label='merchandising links',
                  group="//span[@class='merch_title']",
                  group_key=".//text()",
                  path="./following-sibling::table[1]/"
                       "/td[@class='w_rowtable_colshop']//tr[1]",
                  attrs=_link_attr),
    ]

    # Turn self-closed named anchors (<a name="..." />) into an explicit
    # open/close pair.
    preprocessors = [
        (re.compile('(<a name="[^"]+" )/>', re.I), r'\1></a>')
    ]

    def postprocess_data(self, data):
        """Wrap the parsed data under 'merchandising links' ({} if empty)."""
        if not data:
            return {}
        return {'merchandising links': data}
|
||||
|
||||
|
||||
from movieParser import DOMHTMLTechParser
|
||||
from movieParser import DOMHTMLOfficialsitesParser
|
||||
from movieParser import DOMHTMLAwardsParser
|
||||
from movieParser import DOMHTMLNewsParser
|
||||
|
||||
|
||||
# Registry of the parsers exported by this module: each name maps to a
# ((parser class,), options) pair, where options is None or a dictionary
# such as {'kind': ...}.
_OBJECTS = {
    'maindetails_parser': ((DOMHTMLMaindetailsParser,), None),
    'bio_parser': ((DOMHTMLBioParser,), None),
    'otherworks_parser': ((DOMHTMLOtherWorksParser,), None),
    #'agent_parser': ((DOMHTMLOtherWorksParser,), {'kind': 'agent'}),
    'person_officialsites_parser': ((DOMHTMLOfficialsitesParser,), None),
    'person_awards_parser': ((DOMHTMLAwardsParser,), {'subject': 'name'}),
    'publicity_parser': ((DOMHTMLTechParser,), {'kind': 'publicity'}),
    'person_series_parser': ((DOMHTMLSeriesParser,), None),
    'person_contacts_parser': ((DOMHTMLTechParser,), {'kind': 'contacts'}),
    'person_genres_parser': ((DOMHTMLPersonGenresParser,), None),
    'person_keywords_parser': ((DOMHTMLPersonGenresParser,),
                               {'kind': 'keywords'}),
    'news_parser': ((DOMHTMLNewsParser,), None),
    'sales_parser': ((DOMHTMLPersonSalesParser,), None),
}
|
||||
|
||||
@@ -1,69 +0,0 @@
|
||||
"""
|
||||
parser.http.searchCharacterParser module (imdb package).
|
||||
|
||||
This module provides the HTMLSearchCharacterParser class (and the
|
||||
search_character_parser instance), used to parse the results of a search
|
||||
for a given character.
|
||||
E.g., when searching for the name "Jesse James", the parsed page would be:
|
||||
http://akas.imdb.com/find?s=Characters;mx=20;q=Jesse+James
|
||||
|
||||
Copyright 2007-2009 Davide Alberani <da@erlug.linux.it>
|
||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
from imdb.utils import analyze_name, build_name
|
||||
from utils import Extractor, Attribute, analyze_imdbid
|
||||
|
||||
from searchMovieParser import DOMHTMLSearchMovieParser, DOMBasicMovieParser
|
||||
|
||||
|
||||
class DOMBasicCharacterParser(DOMBasicMovieParser):
    """Simply get the name of a character and the imdbID.

    It's used by the DOMHTMLSearchCharacterParser class to return a result
    for a direct match (when a search on IMDb results in a single
    character, the web server sends the resulting page directly).
    """

    def _titleFunct(self, x):
        # Analyze the text as a name (not a title), without
        # canonicalization; a missing value is treated as u''.
        return analyze_name(x or u'', canonical=False)
|
||||
|
||||
|
||||
class DOMHTMLSearchCharacterParser(DOMHTMLSearchMovieParser):
    """Parse the results of a search for characters.

    Every result is a (imdbID, {'name': name}) tuple, taken from the
    cells containing a link that starts with '/character/ch'.
    """
    _BaseParser = DOMBasicCharacterParser
    _notDirectHitTitle = '<title>imdb search'
    _linkPrefix = '/character/ch'

    def _titleBuilder(self, x):
        # Names are built without canonicalization.
        return build_name(x, canonical=False)

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'name': "./a[1]/text()"
                  },
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link') or u''),
                      {'name': x.get('name')}
                  ))
    ]

    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, "
                       "'/character/ch')]/..",
                  attrs=_attrs)
    ]
|
||||
|
||||
|
||||
# Registry of the parsers exported by this module: each name maps to a
# ((parser class,), options) pair.
_OBJECTS = {
    'search_character_parser': (
        (DOMHTMLSearchCharacterParser,),
        {'kind': 'character', '_basic_parser': DOMBasicCharacterParser},
    ),
}
|
||||
|
||||
@@ -1,71 +0,0 @@
|
||||
"""
|
||||
parser.http.searchCompanyParser module (imdb package).
|
||||
|
||||
This module provides the HTMLSearchCompanyParser class (and the
|
||||
search_company_parser instance), used to parse the results of a search
|
||||
for a given company.
|
||||
E.g., when searching for the name "Columbia Pictures", the parsed page would be:
|
||||
http://akas.imdb.com/find?s=co;mx=20;q=Columbia+Pictures
|
||||
|
||||
Copyright 2008-2009 Davide Alberani <da@erlug.linux.it>
|
||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
from imdb.utils import analyze_company_name, build_company_name
|
||||
from utils import Extractor, Attribute, analyze_imdbid
|
||||
|
||||
from searchMovieParser import DOMHTMLSearchMovieParser, DOMBasicMovieParser
|
||||
|
||||
class DOMBasicCompanyParser(DOMBasicMovieParser):
    """Simply get the name of a company and the imdbID.

    It's used by the DOMHTMLSearchCompanyParser class to return a result
    for a direct match (when a search on IMDb results in a single
    company, the web server sends the company page directly).
    """

    def _titleFunct(self, x):
        # Analyze the text as a company name; a missing value is
        # treated as u''.
        return analyze_company_name(x or u'')
|
||||
|
||||
|
||||
class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
    """Parse the results of a search for companies.

    Every result is a (imdbID, company data) tuple, taken from the cells
    containing a link that starts with '/company/co'; the company data
    is built from the link text plus the first text node after it.
    """
    _BaseParser = DOMBasicCompanyParser
    _notDirectHitTitle = '<title>imdb company'
    _linkPrefix = '/company/co'

    def _titleBuilder(self, x):
        return build_company_name(x)

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'name': "./a[1]/text()",
                      'notes': "./text()[1]"
                  },
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link')),
                      analyze_company_name(
                          x.get('name') + (x.get('notes') or u''),
                          stripNotes=True)
                  ))
    ]

    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, "
                       "'/company/co')]/..",
                  attrs=_attrs)
    ]
|
||||
|
||||
|
||||
# Registry of the parsers exported by this module: each name maps to a
# ((parser class,), options) pair.
_OBJECTS = {
    'search_company_parser': (
        (DOMHTMLSearchCompanyParser,),
        {'kind': 'company', '_basic_parser': DOMBasicCompanyParser},
    ),
}
|
||||
|
||||
@@ -1,111 +0,0 @@
|
||||
"""
|
||||
parser.http.searchKeywordParser module (imdb package).
|
||||
|
||||
This module provides the HTMLSearchKeywordParser class (and the
|
||||
search_keyword_parser instance), used to parse the results of a search
|
||||
for a given keyword.
|
||||
E.g., when searching for the keyword "alabama", the parsed page would be:
|
||||
http://akas.imdb.com/find?s=kw;mx=20;q=alabama
|
||||
|
||||
Copyright 2009 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
from utils import Extractor, Attribute, analyze_imdbid
|
||||
from imdb.utils import analyze_title, analyze_company_name
|
||||
|
||||
from searchMovieParser import DOMHTMLSearchMovieParser, DOMBasicMovieParser
|
||||
|
||||
class DOMBasicKeywordParser(DOMBasicMovieParser):
    """Simply get the name of a keyword.

    It's used by the DOMHTMLSearchKeywordParser class to return a result
    for a direct match (when a search on IMDb results in a single
    keyword, the web server sends the keyword page directly).
    """
    # XXX: it's still to be tested!
    # I'm not even sure there can be a direct hit, searching for keywords.

    def _titleFunct(self, x):
        # The keyword text goes through analyze_company_name; a missing
        # value is treated as u''.
        return analyze_company_name(x or u'')
|
||||
|
||||
|
||||
class DOMHTMLSearchKeywordParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, searching for keywords similar to
    the one given.

    Every result is the text of the first link of a cell containing
    a link that starts with '/keyword/'.
    """

    _BaseParser = DOMBasicKeywordParser
    _notDirectHitTitle = '<title>imdb keyword'
    _linkPrefix = '/keyword/'

    def _titleBuilder(self, x):
        # Keywords are used as they are.
        return x

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path="./a[1]/text()")
    ]

    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, "
                       "'/keyword/')]/..",
                  attrs=_attrs)
    ]
|
||||
|
||||
|
||||
def custom_analyze_title4kwd(title, yearNote, outline):
    """Return a dictionary with the needed info.

    The stripped title is analyzed with analyze_title; when yearNote is
    not empty, its first space-separated token plus a closing ')' is
    appended to the title first.  A non-empty outline is stored under
    the 'plot outline' key.  An empty title gives an empty dictionary.
    """
    full_title = title.strip()
    if not full_title:
        return {}
    if yearNote:
        full_title = full_title + ' ' + ('%s)' % yearNote.split(' ')[0])
    info = analyze_title(full_title)
    if outline:
        info['plot outline'] = outline
    return info
|
||||
|
||||
|
||||
class DOMHTMLSearchMovieKeywordParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, searching for movies with the given
    keyword.

    Every result is a (imdbID, title data) tuple; the title data is
    built by custom_analyze_title4kwd from the link text, the 'desc'
    span and the 'outline' span.
    """

    _notDirectHitTitle = '<title>best'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': "./a[1]//text()",
                      'ynote': "./span[@class='desc']/text()",
                      'outline': "./span[@class='outline']//text()"
                  },
                  postprocess=lambda x: (
                      analyze_imdbid(x.get('link') or u''),
                      custom_analyze_title4kwd(x.get('info') or u'',
                                               x.get('ynote') or u'',
                                               x.get('outline') or u'')
                  ))
    ]

    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, "
                       "'/title/tt')]/..",
                  attrs=_attrs)
    ]
|
||||
|
||||
|
||||
# Registry of the parsers exported by this module: each name maps to a
# ((parser class,), options) pair, where options is None or a dictionary.
_OBJECTS = {
    'search_keyword_parser': (
        (DOMHTMLSearchKeywordParser,),
        {'kind': 'keyword', '_basic_parser': DOMBasicKeywordParser},
    ),
    'search_moviekeyword_parser': ((DOMHTMLSearchMovieKeywordParser,), None),
}
|
||||
|
||||
@@ -1,178 +0,0 @@
|
||||
"""
|
||||
parser.http.searchMovieParser module (imdb package).
|
||||
|
||||
This module provides the HTMLSearchMovieParser class (and the
|
||||
search_movie_parser instance), used to parse the results of a search
|
||||
for a given title.
|
||||
E.g., when searching for the title "the passion", the parsed
|
||||
page would be:
|
||||
http://akas.imdb.com/find?q=the+passion&tt=on&mx=20
|
||||
|
||||
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
|
||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import re
|
||||
from imdb.utils import analyze_title, build_title
|
||||
from utils import DOMParserBase, Attribute, Extractor, analyze_imdbid
|
||||
|
||||
|
||||
class DOMBasicMovieParser(DOMParserBase):
    """Simply get the title of a movie and the imdbID.

    It's used by the DOMHTMLSearchMovieParser class to return a result
    for a direct match (when a search on IMDb results in a single
    movie, the web server sends the movie page directly).
    """
    # Stay generic enough to be used also for other DOMBasic*Parser classes.
    _titleAttrPath = ".//text()"
    _linkPath = "//link[@rel='canonical']"
    _titleFunct = lambda self, x: analyze_title(x or u'')

    def _init(self):
        """Set the per-instance preprocessors and extractors."""
        # Build a new list instead of using "+=": the in-place form would
        # extend the list defined at class level, appending one more copy
        # of this entry every time a parser is instantiated.
        self.preprocessors = self.preprocessors + [
            ('<span class="tv-extra">TV mini-series</span>',
             '<span class="tv-extra">(mini)</span>')]
        self.extractors = [
            Extractor(label='title',
                      path="//h1",
                      attrs=Attribute(key='title',
                                      path=self._titleAttrPath,
                                      postprocess=self._titleFunct)),
            Extractor(label='link',
                      path=self._linkPath,
                      attrs=Attribute(key='link', path="./@href",
                                      postprocess=lambda x:
                                          analyze_imdbid((x or u'').replace(
                                              'http://pro.imdb.com', ''))
                                      ))]

    # Remove 'More at IMDb Pro' links.
    preprocessors = [(re.compile(r'<span class="pro-link".*?</span>'), ''),
            (re.compile(r'<a href="http://ad.doubleclick.net.*?;id=(co[0-9]{7});'), r'<a href="http://pro.imdb.com/company/\1"></a>< a href="')]

    def postprocess_data(self, data):
        """Return [(link, data)] for a usable result, [] otherwise.

        The 'link' entry is removed from *data*; the result is empty when
        there is no 'link' key, when the link is empty, or when nothing
        else is left in *data*.
        """
        if 'link' not in data:
            return []
        link = data.pop('link')
        if link and data:
            return [(link, data)]
        return []
|
||||
|
||||
|
||||
def custom_analyze_title(title):
    """Remove garbage notes after the (year), (year/imdbIndex) or (year) (TV)

    Only the part of the title before the first ' ' separator is kept
    (the whole title when that part is empty) and handed to
    analyze_title; an empty title gives an empty dictionary.
    """
    # XXX: very crappy. :-(
    # NOTE(review): as written the separator is a single space, so only
    # the first word survives; it may have been a wider separator before
    # this text was rendered -- confirm against upstream.
    head = title.split(' ')[0]
    candidate = head or title
    if not candidate:
        return {}
    return analyze_title(candidate)
|
||||
|
||||
# Manage AKAs.
# Group 1 is the text found after 'aka <em>"', up to the closing
# '<br>' or '</td>' tag (group 2).
_reAKAStitles = re.compile(
    r'(?:aka) <em>"(.*?)(<br>|<\/td>)',
    re.I | re.M,
)
|
||||
|
||||
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies.

    Subclasses can adapt it to other kinds of searches overriding
    _BaseParser, _notDirectHitTitle, _titleBuilder, _linkPrefix and
    the extractors."""

    # Parser used on the page sent in case of a direct hit.
    _BaseParser = DOMBasicMovieParser
    # Text (lowercase) found near the top of a regular results page.
    _notDirectHitTitle = '<title>imdb title'
    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    # Every result becomes a (imdbID, analyzed title, akas) tuple.
    _attrs = [Attribute(key='data',
                        multi=True,
                        path={
                            'link': "./a[1]/@href",
                            'info': ".//text()",
                            #'akas': ".//div[@class='_imdbpyAKA']//text()"
                            'akas': ".//p[@class='find-aka']//text()"
                            },
                        postprocess=lambda x: (
                            analyze_imdbid(x.get('link') or u''),
                            custom_analyze_title(x.get('info') or u''),
                            x.get('akas')
                        ))]
    extractors = [Extractor(label='search',
                        path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
                        attrs=_attrs)]

    def _init(self):
        """Initialize the url of the parsed page."""
        self.url = u''

    def _reset(self):
        """Forget the url of the parsed page."""
        self.url = u''

    def preprocess_string(self, html_string):
        """Return the html to be parsed by the 'search' extractor.

        A regular results page is returned almost as it is (for movies,
        the mini-series marker is shortened and the AKAs are prefixed
        with '::').  Any other page is considered a direct hit: it's
        parsed with self._BaseParser and a minimal row of table cells,
        linking the only result, is returned in its place; an empty
        string is returned if nothing useful can be extracted."""
        if self._notDirectHitTitle in html_string[:1024].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                html_string = html_string.replace('(TV mini-series)', '(mini)')
                html_string = html_string.replace('<p class="find-aka">',
                        '<p class="find-aka">::')
                #html_string = _reAKAStitles.sub(
                #        r'<div class="_imdbpyAKA">\1::</div>\2', html_string)
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res: return u''
        res = res['data']
        if not (res and res[0]): return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        #    # Tries to cope with companies for which links to pro.imdb.com
        #    # are missing.
        #    link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title): return u''
        link = link.replace('http://pro.imdb.com', '')
        # The link is put in the third cell, where the extractor looks for it.
        new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link,
                                                                    title)
        return new_html

    def postprocess_data(self, data):
        """Return data, with data['data'] reduced to a list of
        (imdbID, info) tuples.

        The list is truncated to self.results items, if that attribute
        is set; the AKAs, when present as third element of a result,
        are split and stored in info['akas']."""
        if 'data' not in data:
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        # Horrible hack to support AKAs.
        # The type is checked before the length, so that len() is never
        # called on an object of an unexpected kind.
        if data and data['data'] and isinstance(data['data'][0], tuple) \
                and len(data['data'][0]) == 3:
            for idx, datum in enumerate(data['data']):
                if not isinstance(datum, tuple):
                    continue
                if datum[2] is not None:
                    akas = filter(None, datum[2].split('::'))
                    if self._linkPrefix == '/title/tt':
                        akas = [a.replace('" - ', '::').rstrip() for a in akas]
                        akas = [a.replace('aka "', '', 1).lstrip() for a in akas]
                    datum[1]['akas'] = akas
                # With or without AKAs, the third element is dropped.
                data['data'][idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        """Return data unchanged: no references are added to it."""
        return data
|
||||
|
||||
|
||||
# Parsers defined by this module: every name is mapped to a tuple with
# the tuple of parser classes and a second element (None, here).
# NOTE(review): presumably read by the code that instantiates the parsers,
# the second element being optional parameters; confirm against the caller.
_OBJECTS = {
        'search_movie_parser': ((DOMHTMLSearchMovieParser,), None)
}
|
||||
|
||||
@@ -1,92 +0,0 @@
|
||||
"""
|
||||
parser.http.searchPersonParser module (imdb package).
|
||||
|
||||
This module provides the HTMLSearchPersonParser class (and the
|
||||
search_person_parser instance), used to parse the results of a search
|
||||
for a given person.
|
||||
E.g., when searching for the name "Mel Gibson", the parsed page would be:
|
||||
http://akas.imdb.com/find?q=Mel+Gibson&nm=on&mx=20
|
||||
|
||||
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
|
||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import re
|
||||
from imdb.utils import analyze_name, build_name
|
||||
from utils import Extractor, Attribute, analyze_imdbid
|
||||
|
||||
from searchMovieParser import DOMHTMLSearchMovieParser, DOMBasicMovieParser
|
||||
|
||||
|
||||
def _cleanName(n):
|
||||
"""Clean the name in a title tag."""
|
||||
if not n:
|
||||
return u''
|
||||
n = n.replace('Filmography by type for', '') # FIXME: temporary.
|
||||
return n
|
||||
|
||||
class DOMBasicPersonParser(DOMBasicMovieParser):
    """Basic parser getting just the name of a person and the imdbID.

    The DOMHTMLSearchPersonParser class uses it as its _BaseParser, to
    build a result for a direct match (a search on IMDb resulting in a
    single person, for which the web server directly sends the page of
    that person)."""
    def _titleFunct(self, x):
        # Clean the text, before it's analyzed as a name.
        return analyze_name(_cleanName(x), canonical=1)
|
||||
|
||||
|
||||
# Matches the alternative names of a person: 'aka' or 'birth name'
# (case-insensitive) followed by the <em>" opening (group 1), the name
# itself (group 2) and, after the closing quote, the first of <br>, </em>
# or </td> (group 3).
_reAKASp = re.compile(r'(?:aka|birth name) (<em>")(.*?)"(<br>|<\/em>|<\/td>)',
                    re.I | re.M)
|
||||
|
||||
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parser for the html page shown by the IMDb web server when the
    "new search system" is used to search for persons."""
    _linkPrefix = '/name/nm'
    _notDirectHitTitle = '<title>imdb name'
    _BaseParser = DOMBasicPersonParser

    def _titleBuilder(self, x):
        # Build the name of the only result of a direct hit.
        return build_name(x, canonical=True)

    # Every result becomes a (imdbID, analyzed name, akas) tuple; the
    # text following the link is appended to the name, before the analysis.
    _attrs = [Attribute(
        key='data',
        multi=True,
        path={
            'link': "./a[1]/@href",
            'name': "./a[1]/text()",
            'index': "./text()[1]",
            'akas': ".//div[@class='_imdbpyAKA']/text()"
        },
        postprocess=lambda x: (
            analyze_imdbid(x.get('link') or u''),
            analyze_name((x.get('name') or u'') + (x.get('index') or u''),
                         canonical=1),
            x.get('akas')
        ))]
    extractors = [Extractor(
        label='search',
        path="//td[3]/a[starts-with(@href, '/name/nm')]/..",
        attrs=_attrs)]

    def preprocess_string(self, html_string):
        """In a regular results page, wrap the alternative names in
        <div class="_imdbpyAKA"> elements; then, delegate to
        DOMHTMLSearchMovieParser.preprocess_string."""
        page_head = html_string[:1024].lower()
        if self._notDirectHitTitle in page_head:
            wrapped = r'\1<div class="_imdbpyAKA">\2::</div>\3'
            html_string = _reAKASp.sub(wrapped, html_string)
        return DOMHTMLSearchMovieParser.preprocess_string(self, html_string)
|
||||
|
||||
|
||||
# Parsers defined by this module: every name is mapped to a tuple with
# the tuple of parser classes and a dictionary, which here sets 'kind' to
# 'person' and '_basic_parser' to DOMBasicPersonParser.
# NOTE(review): presumably read by the code that instantiates the parsers,
# the dictionary being parameters for them; confirm against the caller.
_OBJECTS = {
        'search_person_parser': ((DOMHTMLSearchPersonParser,),
                    {'kind': 'person', '_basic_parser': DOMBasicPersonParser})
}
|
||||
|
||||
@@ -1,106 +0,0 @@
|
||||
"""
|
||||
parser.http.topBottomParser module (imdb package).
|
||||
|
||||
This module provides the classes (and the instances), used to parse the
|
||||
lists of top 250 and bottom 100 movies.
|
||||
E.g.:
|
||||
http://akas.imdb.com/chart/top
|
||||
http://akas.imdb.com/chart/bottom
|
||||
|
||||
Copyright 2009 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
from imdb.utils import analyze_title
|
||||
from utils import DOMParserBase, Attribute, Extractor, analyze_imdbid
|
||||
|
||||
|
||||
class DOMHTMLTop250Parser(DOMParserBase):
    """Parser for the "top 250" page.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  After the postprocessing, the data is
    a list of (movieID, info) tuples, one for every movie of the chart,
    where info is the analyzed title with the rank and - when they
    can be converted - the number of votes and the rating.

    Example:
        tparser = DOMHTMLTop250Parser()
        result = tparser.parse(top250_html_string)
    """
    # Label of the extractor, and key of the rank in the info of a movie;
    # subclasses can override them.
    label = 'top 250'
    ranktext = 'top 250 rank'

    def _init(self):
        """Install the extractor reading the rows of the chart."""
        self.extractors = [Extractor(label=self.label,
                        path="//div[@id='main']//table//tr",
                        attrs=Attribute(key=None,
                                multi=True,
                                path={self.ranktext: "./td[1]//text()",
                                        'rating': "./td[2]//text()",
                                        'title': "./td[3]//text()",
                                        'movieID': "./td[3]//a/@href",
                                        'votes': "./td[4]//text()"
                                        }))]

    def postprocess_data(self, data):
        """Return the list of (movieID, info) tuples built from the
        extracted rows; rows without movieID, rank or title - and
        rows with an already seen movieID - are skipped."""
        if not data or self.label not in data:
            return []
        mlist = []
        # Avoid duplicates.  A real fix, using XPath, would be desirable.
        # XXX: probably this is no more needed.
        # A set, so that the check doesn't slow down as the list grows.
        seenIDs = set()
        # Values that can't be converted are left out of the info.
        conversionErrors = (AttributeError, TypeError, ValueError)
        for d in data[self.label]:
            if 'movieID' not in d: continue
            if self.ranktext not in d: continue
            if 'title' not in d: continue
            theID = analyze_imdbid(d['movieID'])
            if theID is None:
                continue
            theID = str(theID)
            if theID in seenIDs:
                continue
            seenIDs.add(theID)
            minfo = analyze_title(d['title'])
            try:
                minfo[self.ranktext] = int(d[self.ranktext].replace('.', ''))
            except conversionErrors:
                pass
            if 'votes' in d:
                try:
                    minfo['votes'] = int(d['votes'].replace(',', ''))
                except conversionErrors:
                    pass
            if 'rating' in d:
                try:
                    minfo['rating'] = float(d['rating'])
                except conversionErrors:
                    pass
            mlist.append((theID, minfo))
        return mlist
|
||||
|
||||
|
||||
class DOMHTMLBottom100Parser(DOMHTMLTop250Parser):
    """Parser for the "bottom 100" page.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  Only the label of the extractor and
    the key used for the rank are redefined: everything else is
    inherited from DOMHTMLTop250Parser.

    Example:
        tparser = DOMHTMLBottom100Parser()
        result = tparser.parse(bottom100_html_string)
    """
    ranktext = 'bottom 100 rank'
    label = 'bottom 100'
|
||||
|
||||
|
||||
# Parsers defined by this module: every name is mapped to a tuple with
# the tuple of parser classes and a second element (None, here).
# NOTE(review): presumably read by the code that instantiates the parsers,
# the second element being optional parameters; confirm against the caller.
_OBJECTS = {
    'top250_parser': ((DOMHTMLTop250Parser,), None),
    'bottom100_parser': ((DOMHTMLBottom100Parser,), None)
}
|
||||
|
||||
@@ -1,855 +0,0 @@
|
||||
"""
|
||||
parser.http.utils module (imdb package).
|
||||
|
||||
This module provides miscellaneous utilities used by
|
||||
the imdb.parser.http classes.
|
||||
|
||||
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
|
||||
2008 H. Turgut Uyar <uyar@tekir.org>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
|
||||
from imdb._exceptions import IMDbError
|
||||
|
||||
from imdb.utils import flatten, _Container
|
||||
from imdb.Movie import Movie
|
||||
from imdb.Person import Person
|
||||
from imdb.Character import Character
|
||||
|
||||
|
||||
# Year, imdbIndex and kind.
re_yearKind_index = re.compile(r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)')

# Match imdb ids in href tags
re_imdbid = re.compile(r'(title/tt|name/nm|character/ch|company/co)([0-9]+)')

def analyze_imdbid(href):
    """Return an imdbID from an URL.

    The imdbID is the string of digits following one of the title/tt,
    name/nm, character/ch or company/co prefixes; None is returned
    if href is empty or doesn't contain any of them."""
    found = href and re_imdbid.search(href)
    if not found:
        return None
    return str(found.group(2))
|
||||
|
||||
|
||||
# Keys whose values can contain references to movies, persons or characters.
_modify_keys = list(Movie.keys_tomodify_list) + list(Person.keys_tomodify_list)

def _subRefs(text, re_titles, re_names, re_characters):
    """Return text with names, titles and characters (in this order)
    replaced by their (qv) references; a false pattern is skipped."""
    # Plain unicode literals with a doubled backslash: same values of
    # the raw ones.
    if re_names:
        text = re_names.sub(u"'\\1' (qv)", text)
    if re_titles:
        text = re_titles.sub(u'_\\1_ (qv)', text)
    if re_characters:
        text = re_characters.sub(u'#\\1# (qv)', text)
    return text

def _putRefs(d, re_titles, re_names, re_characters, lastKey=None):
    """Iterate over the strings inside list items or dictionary values,
    substitutes movie titles and person names with the (qv) references.

    d is modified in place, recursively; a string is processed only if
    the key of the dictionary it belongs to (lastKey, for the items of a
    list) is in _modify_keys.  re_titles, re_names and re_characters are
    compiled patterns (or false values, to skip that substitution)."""
    if isinstance(d, list):
        for i in xrange(len(d)):
            item = d[i]
            if isinstance(item, (unicode, str)):
                if lastKey in _modify_keys:
                    d[i] = _subRefs(item, re_titles, re_names, re_characters)
            elif isinstance(item, (list, dict)):
                _putRefs(item, re_titles, re_names, re_characters,
                        lastKey=lastKey)
    elif isinstance(d, dict):
        for k, v in d.items():
            if isinstance(v, (unicode, str)):
                if k in _modify_keys:
                    # The substitutions are chained, like it's done for
                    # the items of a list: every one of them is applied to
                    # the result of the previous one, not to the original
                    # value (which would keep only the last substitution).
                    d[k] = _subRefs(v, re_titles, re_names, re_characters)
            elif isinstance(v, (list, dict)):
                _putRefs(v, re_titles, re_names, re_characters,
                        lastKey=k)
|
||||
|
||||
|
||||
# Handle HTML/XML/SGML entities.
from htmlentitydefs import entitydefs
# Work on a copy, so that the dictionary of the standard library is
# not modified.
entitydefs = entitydefs.copy()
entitydefsget = entitydefs.get
entitydefs['nbsp'] = ' '

# The basic entities, managed separately from every other one.
sgmlentity = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
sgmlentityget = sgmlentity.get
_sgmlentkeys = sgmlentity.keys()

# Map entity names and char references (without '&' and ';') to the
# unicode string used to replace them.
entcharrefs = {}
entcharrefsget = entcharrefs.get
for _k, _v in entitydefs.items():
    # The basic entities are not replaced.
    if _k in _sgmlentkeys: continue
    if _v[0:2] == '&#':
        # The value is a decimal char reference, like '&#NNN;'.
        dec_code = _v[1:-1]
        _v = unichr(int(_v[2:-1]))
        entcharrefs[dec_code] = _v
    else:
        # The value is a single character, decoded as latin_1.
        dec_code = '#' + str(ord(_v))
        _v = unicode(_v, 'latin_1', 'replace')
        entcharrefs[dec_code] = _v
    entcharrefs[_k] = _v
del _sgmlentkeys, _k, _v
entcharrefs['#160'] = u' '
entcharrefs['#xA0'] = u' '
entcharrefs['#xa0'] = u' '
entcharrefs['#XA0'] = u' '
entcharrefs['#x22'] = u'"'
entcharrefs['#X22'] = u'"'
# convert &x26; to &, to make BeautifulSoup happy; beware that this
# leaves lone '&' in the html broken, but I assume this is better than
# the contrary...
# NOTE(review): the '#x26' key is assigned twice; judging from the
# '#x22'/'#X22' pair above, the second one was probably meant to be
# '#X26' - confirm.  Also confirm that the values shown here were not
# entity-decoded by whatever produced this listing.
entcharrefs['#38'] = u'&'
entcharrefs['#x26'] = u'&'
entcharrefs['#x26'] = u'&'

# Matches (ignoring the case) every known name or reference, plus any
# decimal reference up to 5 digits and hexadecimal one up to 4 digits.
re_entcharrefs = re.compile('&(%s|\#160|\#\d{1,5}|\#x[0-9a-f]{1,4});' %
                            '|'.join(map(re.escape, entcharrefs)), re.I)
re_entcharrefssub = re_entcharrefs.sub

# The decimal references of the basic entities.
sgmlentity.update(dict([('#34', u'"'), ('#38', u'&'),
                        ('#60', u'<'), ('#62', u'>'), ('#39', u"'")]))
re_sgmlref = re.compile('&(%s);' % '|'.join(map(re.escape, sgmlentity)))
re_sgmlrefsub = re_sgmlref.sub

# Matches XML-only single tags, like <br/> ; they are invalid in HTML,
# but widely used by IMDb web site. :-/
re_xmltags = re.compile('<([a-zA-Z]+)/>')
|
||||
|
||||
|
||||
def _replXMLRef(match):
    """Replace the matched XML/HTML entities and references;
    replace everything except sgml entities like &lt;, &gt;, ...

    match is a regex match whose group 1 is the name of the entity or
    the char reference, without the leading '&' and the trailing ';'.
    Known names and references are replaced with the value stored in
    entcharrefs; other numeric references are converted to the
    corresponding character, and an unknown name is returned as it is
    (without '&' and ';').  The decimal references of the sgml entities
    and the references that can't be converted are left untouched."""
    ref = match.group(1)
    value = entcharrefsget(ref)
    if value is not None:
        return value
    if ref[0] != '#':
        return ref
    ref_code = ref[1:]
    if ref_code in ('34', '38', '60', '62', '39'):
        return match.group(0)
    try:
        if ref_code[0].lower() == 'x':
            return unichr(int(ref_code[1:], 16))
        return unichr(int(ref_code))
    except (ValueError, OverflowError):
        # unichr refuses the code points outside of the range it supports
        # (e.g. above 0xFFFF, on narrow builds): better an unreplaced
        # reference, than an exception aborting the whole parsing.
        return match.group(0)
|
||||
|
||||
def subXMLRefs(s):
    """Return the given html string with entity and char references
    replaced.

    Every match of the re_entcharrefs pattern found in s is replaced
    with the value returned by _replXMLRef."""
    return re_entcharrefssub(_replXMLRef, s)
|
||||
|
||||
# XXX: no more used here; move it to mobile (they are imported by helpers, too)?
def _replSGMLRefs(match):
    """Replace the matched SGML entity.

    match is a regex match whose group 1 is the name of the entity
    (without the leading '&' and the trailing ';'); the value stored in
    sgmlentity is returned or, for an unknown name, the name itself."""
    ref = match.group(1)
    return sgmlentityget(ref, ref)
|
||||
|
||||
def subSGMLRefs(s):
    """Return the given html string with sgml entity and char references
    replaced.

    Every match of the re_sgmlref pattern found in s is replaced with
    the value returned by _replSGMLRefs."""
    return re_sgmlrefsub(_replSGMLRefs, s)
|
||||
|
||||
|
||||
_b_p_logger = logging.getLogger('imdbpy.parser.http.build_person')
def build_person(txt, personID=None, billingPos=None,
                roleID=None, accessSystem='http', modFunct=None):
    """Return a Person instance from the typical <tr>...</tr> strings
    found in the IMDb's web site.

    txt is the text with the name, optionally followed by a separator
    ('....' or '...') and by the role and/or some notes.  roleID can be
    None (whatever follows the name is considered notes), a single ID or
    a list of IDs (the role is then split on '/', and a list of roles is
    built).  personID, billingPos, accessSystem and modFunct are passed
    to the Person constructor (personID and the role IDs, as strings)."""
    #if personID is None
    #    _b_p_logger.debug('empty name or personID for "%s"', txt)
    notes = u''
    role = u''
    # Search the (optional) separator between name and role/notes.
    if txt.find('....') != -1:
        sep = '....'
    elif txt.find('...') != -1:
        sep = '...'
    else:
        sep = '...'
        # Replace the first parenthesis, assuming there are only
        # notes, after.
        # Rationale: no imdbIndex is (ever?) showed on the web site.
        txt = txt.replace('(', '...(', 1)
    txt_split = txt.split(sep, 1)
    name = txt_split[0].strip()
    if len(txt_split) == 2:
        role_comment = txt_split[1].strip()
        # Strip common endings.
        if role_comment[-4:] == ' and':
            role_comment = role_comment[:-4].rstrip()
        elif role_comment[-2:] == ' &':
            role_comment = role_comment[:-2].rstrip()
        elif role_comment[-6:] == '& ....':
            role_comment = role_comment[:-6].rstrip()
        # Get the notes.
        if roleID is not None:
            if not isinstance(roleID, list):
                cmt_idx = role_comment.find('(')
                if cmt_idx != -1:
                    role = role_comment[:cmt_idx].rstrip()
                    notes = role_comment[cmt_idx:]
                else:
                    # Just a role, without notes.
                    role = role_comment
            else:
                role = role_comment
        else:
            # We're managing something that doesn't have a 'role', so
            # everything are notes.
            notes = role_comment
    if role == '....': role = u''
    roleNotes = []
    # Manages multiple roleIDs.
    if isinstance(roleID, list):
        rolesplit = role.split('/')
        role = []
        for r in rolesplit:
            nidx = r.find('(')
            if nidx != -1:
                role.append(r[:nidx].rstrip())
                roleNotes.append(r[nidx:])
            else:
                role.append(r)
                roleNotes.append(None)
        lr = len(role)
        lrid = len(roleID)
        if lr > lrid:
            # More roles than IDs: pad the IDs with None.  The number of
            # missing items is lr - lrid (the opposite difference is
            # negative, and nothing would be added); a new list is built,
            # so that the one of the caller is not extended.
            roleID = roleID + [None] * (lr - lrid)
        elif lr < lrid:
            roleID = roleID[:lr]
        for i, rid in enumerate(roleID):
            if rid is not None:
                roleID[i] = str(rid)
        if lr == 1:
            role = role[0]
            roleID = roleID[0]
    elif roleID is not None:
        roleID = str(roleID)
    if personID is not None:
        personID = str(personID)
    if (not name) or (personID is None):
        # Set to 'debug', since build_person is expected to receive some crap.
        _b_p_logger.debug('empty name or personID for "%s"', txt)
    # XXX: return None if something strange is detected?
    person = Person(name=name, personID=personID, currentRole=role,
                    roleID=roleID, notes=notes, billingPos=billingPos,
                    modFunct=modFunct, accessSystem=accessSystem)
    # roleID is None when the only role has no ID: nothing to compare, then.
    if roleNotes and roleID is not None and len(roleNotes) == len(roleID):
        for idx, role in enumerate(person.currentRole):
            if roleNotes[idx]:
                role.notes = roleNotes[idx]
    return person
|
||||
|
||||
|
||||
# Matches the seven-digit IDs found in a string of role IDs.
_re_chrIDs = re.compile('[0-9]{7}')

_b_m_logger = logging.getLogger('imdbpy.parser.http.build_movie')
# To shrink spaces.
re_spaces = re.compile(r'\s+')
def build_movie(txt, movieID=None, roleID=None, status=None,
                accessSystem='http', modFunct=None, _parsingCharacter=False,
                _parsingCompany=False, year=None, chrRoles=None,
                rolesNoChar=None, additionalNotes=None):
    """Given a string as normally seen on the "categorized" page of
    a person on the IMDb's web site, returns a Movie instance.

    txt is the text with the title, optionally followed by a separator
    (' .... ' by default, ' Played by ' if _parsingCharacter is true,
    ' ... ' if _parsingCompany is true) and by the role.  The notes in
    parentheses at the end of the title are moved to the notes of the
    movie; if year is given, it's appended to the title.  chrRoles is a
    '@@'-separated string of roles, used when txt contains no role and
    roleID is a string; rolesNoChar is a '/'-separated string of roles,
    added to the other ones; additionalNotes is appended to the notes.
    status, if set, is stored in the 'status' key of the movie."""
    # FIXME: Oook, lets face it: build_movie and build_person are now
    # two horrible sets of patches to support the new IMDb design.  They
    # must be rewritten from scratch.
    if _parsingCharacter:
        _defSep = ' Played by '
    elif _parsingCompany:
        _defSep = ' ... '
    else:
        _defSep = ' .... '
    title = re_spaces.sub(' ', txt).strip()
    # Split the role/notes from the movie title.
    tsplit = title.split(_defSep, 1)
    role = u''
    notes = u''
    roleNotes = []
    if len(tsplit) == 2:
        title = tsplit[0].rstrip()
        role = tsplit[1].lstrip()
    if title[-9:] == 'TV Series':
        title = title[:-9].rstrip()
    elif title[-14:] == 'TV mini-series':
        title = title[:-14] + ' (mini)'
    # Try to understand where the movie title ends.
    while True:
        if year:
            break
        if title[-1:] != ')':
            # Ignore the silly "TV Series" notice.
            if title[-9:] == 'TV Series':
                title = title[:-9].rstrip()
                continue
            else:
                # Just a title: stop here.
                break
        # Try to match paired parentheses; yes: sometimes there are
        # parentheses inside comments...
        nidx = title.rfind('(')
        while (nidx != -1 and \
                title[nidx:].count('(') != title[nidx:].count(')')):
            nidx = title[:nidx].rfind('(')
        # Unbalanced parentheses: stop here.
        if nidx == -1: break
        # The last item in parentheses seems to be a year: stop here.
        first4 = title[nidx+1:nidx+5]
        if (first4.isdigit() or first4 == '????') and \
                title[nidx+5:nidx+6] in (')', '/'): break
        # The last item in parentheses is a known kind: stop here.
        if title[nidx+1:-1] in ('TV', 'V', 'mini', 'VG'): break
        # Else, in parentheses there are some notes.
        # XXX: should the notes in the role half be kept separated
        #      from the notes in the movie title half?
        if notes: notes = '%s %s' % (title[nidx:], notes)
        else: notes = title[nidx:]
        title = title[:nidx].rstrip()
    if year:
        year = year.strip()
        # A slice, so that an empty title doesn't raise an IndexError.
        if title[-1:] == ')':
            fpIdx = title.rfind('(')
            if fpIdx != -1:
                if notes: notes = '%s %s' % (title[fpIdx:], notes)
                else: notes = title[fpIdx:]
                title = title[:fpIdx].rstrip()
        title = u'%s (%s)' % (title, year)
    if _parsingCharacter and roleID and not role:
        roleID = None
    if not roleID:
        roleID = None
    elif len(roleID) == 1:
        roleID = roleID[0]
    if not role and chrRoles and isinstance(roleID, (str, unicode)):
        roleID = _re_chrIDs.findall(roleID)
        role = ' / '.join(filter(None, chrRoles.split('@@')))
    # Manages multiple roleIDs.
    if isinstance(roleID, list):
        tmprole = role.split('/')
        role = []
        for r in tmprole:
            nidx = r.find('(')
            if nidx != -1:
                role.append(r[:nidx].rstrip())
                roleNotes.append(r[nidx:])
            else:
                role.append(r)
                roleNotes.append(None)
        lr = len(role)
        lrid = len(roleID)
        if lr > lrid:
            # More roles than IDs: pad the IDs with None.  The number of
            # missing items is lr - lrid (the opposite difference is
            # negative, and nothing would be added); a new list is built,
            # so that the one of the caller is not extended.
            roleID = roleID + [None] * (lr - lrid)
        elif lr < lrid:
            roleID = roleID[:lr]
        for i, rid in enumerate(roleID):
            if rid is not None:
                roleID[i] = str(rid)
        if lr == 1:
            role = role[0]
            roleID = roleID[0]
    elif roleID is not None:
        roleID = str(roleID)
    if movieID is not None:
        movieID = str(movieID)
    if (not title) or (movieID is None):
        _b_m_logger.error('empty title or movieID for "%s"', txt)
    if rolesNoChar:
        rolesNoChar = filter(None, [x.strip() for x in rolesNoChar.split('/')])
        if not role:
            role = []
        elif not isinstance(role, list):
            role = [role]
        role += rolesNoChar
    notes = notes.strip()
    if additionalNotes:
        additionalNotes = re_spaces.sub(' ', additionalNotes).strip()
        if notes:
            notes += u' '
        notes += additionalNotes
    m = Movie(title=title, movieID=movieID, notes=notes, currentRole=role,
                roleID=roleID, roleIsPerson=_parsingCharacter,
                modFunct=modFunct, accessSystem=accessSystem)
    # roleID is None when the only role has no ID: nothing to compare, then.
    if roleNotes and roleID is not None and len(roleNotes) == len(roleID):
        for idx, role in enumerate(m.currentRole):
            try:
                if roleNotes[idx]:
                    role.notes = roleNotes[idx]
            except IndexError:
                break
    # Status can't be checked here, and must be detected by the parser.
    if status:
        m['status'] = status
    return m
|
||||
|
||||
|
||||
class DOMParserBase(object):
|
||||
"""Base parser to handle HTML data from the IMDb's web server."""
|
||||
_defGetRefs = False
|
||||
_containsObjects = False
|
||||
|
||||
preprocessors = []
|
||||
extractors = []
|
||||
usingModule = None
|
||||
|
||||
_logger = logging.getLogger('imdbpy.parser.http.domparser')
|
||||
|
||||
def __init__(self, useModule=None):
    """Initialize the parser. useModule can be used to force it
    to use 'BeautifulSoup' or 'lxml'; by default, it's auto-detected,
    using 'lxml' if available and falling back to 'BeautifulSoup'
    otherwise.

    useModule can be a single name or a list/tuple of names, tried in
    order; IMDbError is raised if none of them can be used."""
    # Module to use.
    if useModule is None:
        useModule = ('lxml', 'BeautifulSoup')
    if not isinstance(useModule, (tuple, list)):
        useModule = [useModule]
    self._useModule = useModule
    nrMods = len(useModule)
    _gotError = False
    for idx, mod in enumerate(useModule):
        mod = mod.strip().lower()
        try:
            if mod == 'lxml':
                from lxml.html import fromstring
                from lxml.etree import tostring
                self._is_xml_unicode = False
                self.usingModule = 'lxml'
            elif mod == 'beautifulsoup':
                from bsouplxml.html import fromstring
                from bsouplxml.etree import tostring
                self._is_xml_unicode = True
                self.usingModule = 'beautifulsoup'
            else:
                # Unknown names are skipped; if this was the last one,
                # the loop ends without a break and the "else" clause
                # of the "for" raises the exception.
                self._logger.warn('unknown module "%s"' % mod)
                continue
            self.fromstring = fromstring
            self._tostring = tostring
            if _gotError:
                self._logger.warn('falling back to "%s"' % mod)
            # The first usable module wins.
            break
        except ImportError, e:
            if idx+1 >= nrMods:
                # Raise the exception, if we don't have any more
                # options to try.
                raise IMDbError, 'unable to use any parser in %s: %s' % \
                                    (str(useModule), str(e))
            else:
                self._logger.warn('unable to use "%s": %s' % (mod, str(e)))
                _gotError = True
            continue
    else:
        # Reached only if the loop was never interrupted by a break.
        raise IMDbError, 'unable to use parsers in %s' % str(useModule)
    # Fall-back defaults.
    self._modFunct = None
    self._as = 'http'
    self._cname = self.__class__.__name__
    # Hooks for the subclasses; notice that reset() calls _reset().
    self._init()
    self.reset()
|
||||
|
||||
def reset(self):
    """Reset the parser: forget the references collected so far,
    then call the _reset() hook of the subclass."""
    # Characters, titles and names references.
    self._charactersRefs = {}
    self._titlesRefs = {}
    self._namesRefs = {}
    self._reset()
|
||||
|
||||
def _init(self):
    """Subclasses can override this method, if needed.

    It's called by __init__, just before reset(); this implementation
    does nothing."""
    pass
|
||||
|
||||
def _reset(self):
    """Subclasses can override this method, if needed.

    It's called at the end of reset(); this implementation does
    nothing."""
    pass
|
||||
|
||||
def parse(self, html_string, getRefs=None, **kwds):
    """Return the dictionary generated from the given html string;
    getRefs can be used to force the gathering of movies/persons/characters
    references.

    The parser is reset, the string is converted to unicode (decoding
    it as latin_1, if needed), its entities are replaced and it's
    preprocessed; then the DOM is built, preprocessed and parsed, and
    the resulting data is postprocessed and given to add_refs().  The
    exceptions raised preprocessing the DOM, gathering the references
    and postprocessing the data are logged, and the parsing goes on.
    The keyword arguments are accepted, but not used by this method."""
    self.reset()
    if getRefs is not None:
        self.getRefs = getRefs
    else:
        self.getRefs = self._defGetRefs
    # Useful only for the testsuite.
    if not isinstance(html_string, unicode):
        html_string = unicode(html_string, 'latin_1', 'replace')
    html_string = subXMLRefs(html_string)
    # Temporary fix: self.parse_dom must work even for empty strings.
    html_string = self.preprocess_string(html_string)
    html_string = html_string.strip()
    # Once the entities are replaced, the quoted value of a tag attribute
    # (think of a title like "Family Guy", with its quotes written as
    # char references) ends up like title=""Family Guy"", and this
    # confuses BeautifulSoup.
    if self.usingModule == 'beautifulsoup':
        html_string = html_string.replace('""', '"')
    #print html_string.encode('utf8')
    if html_string:
        dom = self.get_dom(html_string)
        #print self.tostring(dom).encode('utf8')
        try:
            dom = self.preprocess_dom(dom)
        except Exception:
            self._logger.error('%s: caught exception preprocessing DOM',
                                self._cname, exc_info=True)
        if self.getRefs:
            try:
                self.gather_refs(dom)
            except Exception:
                # One placeholder for the only argument (the traceback is
                # added by exc_info): with a second, unmatched one the
                # formatting of the message fails and the warning is lost.
                self._logger.warn('%s: unable to gather refs',
                                self._cname, exc_info=True)
        data = self.parse_dom(dom)
    else:
        data = {}
    try:
        data = self.postprocess_data(data)
    except Exception:
        self._logger.error('%s: caught exception postprocessing data',
                            self._cname, exc_info=True)
    if self._containsObjects:
        self.set_objects_params(data)
    data = self.add_refs(data)
    return data
|
||||
|
||||
def _build_empty_dom(self):
    """Return an empty DOM: a BeautifulSoup object built from an
    empty string."""
    # Local import: bsouplxml is imported only when this method is called.
    from bsouplxml import _bsoup
    return _bsoup.BeautifulSoup('')
|
||||
|
||||
def get_dom(self, html_string):
    """Return a dom object, from the given string.

    If the string can't be parsed (an exception is raised, or None
    is returned by the parser) the problem is logged and an empty DOM
    is returned in its place."""
    try:
        parsed = self.fromstring(html_string)
        if parsed is not None:
            return parsed
        parsed = self._build_empty_dom()
        self._logger.error('%s: using a fake empty DOM', self._cname)
        return parsed
    except Exception:
        self._logger.error('%s: caught exception parsing DOM',
                            self._cname, exc_info=True)
        return self._build_empty_dom()
|
||||
|
||||
def xpath(self, element, path):
    """Return elements matching the given XPath.

    If the module in use doesn't already return unicode strings, the
    plain strings of the result are converted to unicode.  Any
    exception is logged, and an empty list is returned."""
    try:
        matches = element.xpath(path)
        if self._is_xml_unicode:
            return matches
        converted = []
        for match in matches:
            if isinstance(match, str):
                match = unicode(match)
            converted.append(match)
        return converted
    except Exception:
        self._logger.error('%s: caught exception extracting XPath "%s"',
                            self._cname, path, exc_info=True)
        return []
|
||||
|
||||
def tostring(self, element):
    """Convert the element to a string.

    A string is returned as unicode; any other element is serialized
    by the module in use: if that fails, the exception is logged and
    an empty string is returned."""
    if not isinstance(element, (unicode, str)):
        try:
            return self._tostring(element, encoding=unicode)
        except Exception:
            self._logger.error('%s: unable to convert to string',
                                self._cname, exc_info=True)
            return u''
    return unicode(element)
|
||||
|
||||
def clone(self, element):
    """Clone an element.

    The copy is built converting the element to a string with
    self.tostring, and parsing it again with self.fromstring."""
    return self.fromstring(self.tostring(element))
|
||||
|
||||
def preprocess_string(self, html_string):
    """Return html_string, modified before it's parsed.

    Empty input is returned as it is.  Otherwise the ' \\xbb' sequence
    is stripped and then every (src, sub) pair listed in
    self.preprocessors (if the attribute exists) is applied in order:
      - an object with a callable 'sub' attribute (e.g. a compiled
        regular expression) is used as src.sub(sub, text);
      - a str is replaced with 'sub';
      - any other callable is called with the text as its only
        argument; if it raises, the error is logged and the text is
        left as it was.
    """
    if not html_string:
        return html_string
    # Remove silly » chars.
    text = html_string.replace(u' \xbb', u'')
    # No 'preprocessors' attribute means nothing else to do.
    for src, sub in getattr(self, 'preprocessors', ()):
        # re._pattern_type is present only since Python 2.5, hence
        # the check on the 'sub' attribute.
        if callable(getattr(src, 'sub', None)):
            text = src.sub(sub, text)
            continue
        if isinstance(src, str):
            text = text.replace(src, sub)
            continue
        if not callable(src):
            continue
        try:
            text = src(text)
        except Exception:
            self._logger.error('%s: caught exception preprocessing html',
                               self._cname, exc_info=True)
    return text
|
||||
|
||||
def gather_refs(self, dom):
    """Collect the references found in the given dom.

    A GatherRefs parser, configured like this one, is run over the
    dom; its results are stored in self._namesRefs, self._titlesRefs
    and self._charactersRefs.
    """
    refs_parser = GatherRefs(useModule=self._useModule)
    refs_parser._as = self._as
    refs_parser._modFunct = self._modFunct
    gathered = refs_parser.postprocess_data(refs_parser.parse_dom(dom))
    self._namesRefs = gathered['names refs']
    self._titlesRefs = gathered['titles refs']
    self._charactersRefs = gathered['characters refs']
|
||||
|
||||
def preprocess_dom(self, dom):
    """Hook to modify the dom before parse_dom applies the rules
    in self.extractors.

    This implementation returns the dom unchanged.
    """
    return dom
|
||||
|
||||
def parse_dom(self, dom):
    """Parse the given dom according to the rules specified
    in self.extractors, and return the collected data as a dictionary.

    For every extractor the matching elements are paired with a key
    (the extractor's label or, when a group is defined, the key of
    the group they belong to); then every attribute of the extractor
    is evaluated on each element and stored in the result.
    """
    parsed = {}
    for extractor in self.extractors:
        # Step 1: collect the (key, element) pairs to work on.
        if extractor.group is None:
            pairs = [(extractor.label, node)
                     for node in self.xpath(dom, extractor.path)]
        else:
            pairs = []
            for group in self.xpath(dom, extractor.group):
                found_keys = self.xpath(group, extractor.group_key)
                if not found_keys:
                    continue
                # XXX: always tries the conversion to unicode:
                #      BeautifulSoup.NavigableString is a subclass
                #      of unicode, and so it's never converted.
                group_key = self.tostring(found_keys[0])
                normalizer = extractor.group_key_normalize
                if normalizer is not None and callable(normalizer):
                    try:
                        group_key = normalizer(group_key)
                    except Exception:
                        self._logger.error(
                            '%s: unable to apply group_key normalizer',
                            self._cname, exc_info=True)
                for node in self.xpath(group, extractor.path):
                    pairs.append((group_key, node))
        # Step 2: extract every attribute from every element.
        for group_key, node in pairs:
            for attr in extractor.attrs:
                if isinstance(attr.path, dict):
                    # One XPath per field: build a dictionary.
                    data = {}
                    for field, field_path in attr.path.items():
                        values = self.xpath(node, field_path)
                        if values:
                            # XXX: use u'' , to join?
                            data[field] = ''.join(values)
                        else:
                            data[field] = None
                else:
                    data = self.xpath(node, attr.path)
                    if data:
                        data = attr.joiner.join(data)
                    else:
                        data = None
                if not data:
                    continue
                postprocess = attr.postprocess
                if callable(postprocess):
                    try:
                        data = postprocess(data)
                    except Exception:
                        self._logger.error(
                            '%s: unable to apply attr postprocess',
                            self._cname, exc_info=True)
                # Step 3: work out the key under which data is stored.
                key = attr.key
                if key is None:
                    key = group_key
                elif key.startswith('.'):
                    # assuming this is an xpath
                    try:
                        key = self.xpath(node, key)[0]
                    except IndexError:
                        self._logger.error('%s: XPath returned no items',
                                           self._cname, exc_info=True)
                elif key.startswith('self.'):
                    key = getattr(self, key[len('self.'):])
                if attr.multi:
                    parsed.setdefault(key, []).append(data)
                elif isinstance(data, dict):
                    parsed.update(data)
                else:
                    parsed[key] = data
    return parsed
|
||||
|
||||
def postprocess_data(self, data):
    """Hook to modify the data collected by the parser.

    This implementation returns the data unchanged.
    """
    return data
|
||||
|
||||
def set_objects_params(self, data):
    """Set the accessSystem and modFunct attributes of every
    _Container instance found in data (dictionary keys included),
    since they are not always set in the parser's code."""
    containers = flatten(data, yieldDictKeys=True, scalar=_Container)
    for container in containers:
        container.accessSystem = self._as
        container.modFunct = self._modFunct
|
||||
|
||||
def add_refs(self, data):
    """Modify data according to the expected output.

    When self.getRefs is true, _putRefs is called on data with the
    regular expressions matching the collected titles, names and
    characters (None for an empty set of references).
    Return a dictionary with the 'data', 'titlesRefs', 'namesRefs'
    and 'charactersRefs' keys.
    """
    if self.getRefs:
        def _refs_regexp(refs):
            # Alternation of the escaped keys of 'refs', or None when
            # there's nothing to match.
            pattern = u'(%s)' % '|'.join([re.escape(ref)
                                          for ref in refs.keys()])
            if pattern == u'()':
                return None
            return re.compile(pattern, re.U)

        re_titles = _refs_regexp(self._titlesRefs)
        re_names = _refs_regexp(self._namesRefs)
        re_characters = _refs_regexp(self._charactersRefs)
        _putRefs(data, re_titles, re_names, re_characters)
    output = {'data': data}
    output['titlesRefs'] = self._titlesRefs
    output['namesRefs'] = self._namesRefs
    output['charactersRefs'] = self._charactersRefs
    return output
|
||||
|
||||
|
||||
class Extractor(object):
    """Instruct the DOM parser about how to parse a document."""

    def __init__(self, label, path, attrs, group=None, group_key=None,
                 group_key_normalize=None):
        """Initialize an Extractor object, used to instruct the DOM parser
        about how to parse a document.

        label -- rarely (never?) used, mostly for debugging purposes;
                 parse_dom uses it as the key when no group is given.
        path -- XPath of the elements to process.
        attrs -- an Attribute instance, or a list of them, to fetch
                 from every element.
        group -- optional XPath of the groups containing the elements.
        group_key -- XPath of the key of a group (".//text()" if None).
        group_key_normalize -- optional callable applied to the key
                               of every group.
        """
        if group_key is None:
            group_key = ".//text()"
        # A single Attribute is accepted in place of a list.
        if isinstance(attrs, Attribute):
            attrs = [attrs]
        self.label = label
        self.path = path
        self.attrs = attrs
        self.group = group
        self.group_key = group_key
        self.group_key_normalize = group_key_normalize

    def __repr__(self):
        """String representation of an Extractor object."""
        template = '<Extractor id:%s (label=%s, path=%s, attrs=%s, ' \
                   'group=%s, group_key=%s group_key_normalize=%s)>'
        values = (id(self), self.label, self.path, repr(self.attrs),
                  self.group, self.group_key, self.group_key_normalize)
        return template % values
|
||||
|
||||
|
||||
class Attribute(object):
    """The attribute to consider, for a given node."""

    def __init__(self, key, multi=False, path=None, joiner=None,
                 postprocess=None):
        """Initialize an Attribute object, used to specify the
        attribute to consider, for a given node.

        key -- the key under which information will be saved; can be
               a string or an XPath.  If None, the label of the
               containing extractor will be used.
        multi -- if true, parse_dom appends the values to a list
                 stored under the key.
        path -- the XPath of the data (parse_dom also accepts a
                dictionary mapping field names to XPaths).
        joiner -- string used to join the extracted items ('' if None).
        postprocess -- optional callable applied to the extracted data.
        """
        if joiner is None:
            joiner = ''
        self.key = key
        self.multi = multi
        self.path = path
        self.joiner = joiner
        # Post-process this set of information.
        self.postprocess = postprocess

    def __repr__(self):
        """String representation of an Attribute object."""
        template = '<Attribute id:%s (key=%s, multi=%s, path=%s, ' \
                   'joiner=%s, postprocess=%s)>'
        values = (id(self), self.key, self.multi, repr(self.path),
                  self.joiner, repr(self.postprocess))
        return template % values
|
||||
|
||||
|
||||
def _parse_ref(text, link, info):
    """Manage links to references.

    Return a (text, link) tuple, where newlines in text are replaced
    by spaces.  For links to titles, the leading part of 'info'
    matched by re_yearKind_index is appended to the text.
    """
    if link.find('/title/tt') == -1:
        return (text.replace('\n', ' '), link)
    year_kind = re_yearKind_index.match(info)
    if year_kind and year_kind.start() == 0:
        text = text + ' %s' % info[:year_kind.end()]
    return (text.replace('\n', ' '), link)
|
||||
|
||||
|
||||
class GatherRefs(DOMParserBase):
    """Parser used to gather references to movies, persons and characters."""

    # For every matching link: its text, its target and the text that
    # follows it, turned into a (text, link) tuple by _parse_ref.
    _attrs = [
        Attribute(
            key=None,
            multi=True,
            path={
                'text': './text()',
                'link': './@href',
                'info': './following::text()[1]'
            },
            postprocess=lambda x: _parse_ref(
                x.get('text'),
                x.get('link'),
                (x.get('info') or u'').strip()))
    ]

    extractors = [
        Extractor(
            label='names refs',
            path="//a[starts-with(@href, '/name/nm')][string-length(@href)=16]",
            attrs=_attrs),

        Extractor(
            label='titles refs',
            path="//a[starts-with(@href, '/title/tt')]"
                 "[string-length(@href)=17]",
            attrs=_attrs),

        Extractor(
            label='characters refs',
            path="//a[starts-with(@href, '/character/ch')]"
                 "[string-length(@href)=21]",
            attrs=_attrs),
    ]

    def postprocess_data(self, data):
        """Convert the collected (text, link) tuples to objects.

        Return a dictionary with the 'names refs', 'titles refs' and
        'characters refs' keys; each value maps the text of a link to
        a Person, Movie or Character instance.  Links not ending with
        a slash are ignored.
        """
        # For every kind of reference: the class to instantiate and
        # the names of its ID and name/title keyword arguments.
        builders = {
            'names refs': (Person, 'personID', 'name'),
            'titles refs': (Movie, 'movieID', 'title'),
            'characters refs': (Character, 'characterID', 'name'),
        }
        result = {}
        for item in ('names refs', 'titles refs', 'characters refs'):
            builder, id_kwd, text_kwd = builders[item]
            refs = result[item] = {}
            for k, v in data.get(item, []):
                if not v.endswith('/'):
                    continue
                kwds = {id_kwd: analyze_imdbid(v), text_kwd: k,
                        'accessSystem': self._as,
                        'modFunct': self._modFunct}
                # XXX: companies aren't handled: are they ever found in text,
                #      as links to their page?
                refs[k] = builder(**kwds)
        return result

    def add_refs(self, data):
        """Return data as it is: this parser doesn't add references."""
        return data
|
||||
|
||||
|
||||
@@ -1,833 +0,0 @@
|
||||
"""
|
||||
parser.mobile package (imdb package).
|
||||
|
||||
This package provides the IMDbMobileAccessSystem class used to access
|
||||
IMDb's data for mobile systems.
|
||||
The imdb.IMDb function will return an instance of this class when
|
||||
called with the 'accessSystem' argument set to "mobile".
|
||||
|
||||
Copyright 2005-2010 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from urllib import unquote
|
||||
|
||||
from imdb import imdbURL_movie_main, imdbURL_person_main, imdbURL_character_main
|
||||
from imdb.Movie import Movie
|
||||
from imdb.utils import analyze_title, analyze_name, canonicalName, \
|
||||
date_and_notes
|
||||
from imdb._exceptions import IMDbDataAccessError
|
||||
from imdb.parser.http import IMDbHTTPAccessSystem
|
||||
from imdb.parser.http.utils import subXMLRefs, subSGMLRefs, build_person, \
|
||||
build_movie, re_spaces
|
||||
|
||||
# XXX NOTE: the first version of this module was heavily based on
|
||||
# regular expressions. This new version replaces regexps with
|
||||
# find() strings' method calls; despite being less flexible, it
|
||||
# seems to be at least as fast and, hopefully, much more
|
||||
# lightweight. Yes: the regexp-based version was too heavyweight
|
||||
# for systems with very limited CPU power and memory footprint.
|
||||
# Regular expressions.
# Html tags (stripped by _unHtml).
re_unhtml = re.compile(r'<.+?>')
# imdb person, movie or character ids: seven digits after nm, tt or ch.
re_imdbID = re.compile(r'(?<=nm|tt|ch)([0-9]{7})\b')
# movie AKAs.
re_makas = re.compile('(<p class="find-aka">.*?</p>)')
# Remove episode numbers.
re_filmo_episodes = re.compile('<div class="filmo-episodes">.*?</div>',
                               re.I | re.M)

# Shortcuts to the 'sub' methods used by _unHtml.
re_spacessub = re_spaces.sub
re_unhtmlsub = re_unhtml.sub
|
||||
|
||||
|
||||
def _unHtml(s):
    """Return the string 's' stripped of html tags and of surrounding
    spaces, with runs of spaces squeezed and SGML references
    substituted."""
    without_tags = re_unhtmlsub('', s)
    squeezed = re_spacessub(' ', without_tags)
    return subSGMLRefs(squeezed.strip())
|
||||
|
||||
|
||||
# The type of integers; used to tell parsed numbers from strings.
_inttype = int
|
||||
|
||||
def _getTagsWith(s, cont, toClosure=False, maxRes=None):
    """Return the html tags in the 's' string containing the 'cont'
    string; if toClosure is True, everything between the opening
    tag and the closing tag is returned.

    At most maxRes items are returned, if maxRes is not None (at least
    one item is always returned, when found).  The search stops at the
    first occurrence of 'cont' that can't be turned into a result (no
    opening '<' before it, or no end of the tag after it).

    The string is scanned iteratively, so that the number of results
    is not limited by the maximum recursion depth, and maxRes is
    honored for any number of results.
    """
    lres = []
    start = 0
    while True:
        bi = s.find(cont, start)
        if bi == -1:
            break
        # Beginning of the tag: the last '<' before the match, but
        # not before the end of the previous result.
        btag = s.rfind('<', start, bi)
        if btag == -1:
            break
        if not toClosure:
            etag = s.find('>', bi+1)
            if etag == -1:
                break
            endidx = etag + 1
        else:
            # The name of the tag ends at the first space.
            spaceidx = s.find(' ', btag)
            if spaceidx == -1:
                break
            ctag = '</%s>' % s[btag+1:spaceidx]
            closeidx = s.find(ctag, bi)
            if closeidx == -1:
                break
            endidx = closeidx + len(ctag)
        lres.append(s[btag:endidx])
        if maxRes is not None and len(lres) >= maxRes:
            break
        start = endidx
    return lres
|
||||
|
||||
|
||||
def _findBetween(s, begins, ends, beginindx=0, maxRes=None, lres=None):
    """Return the list of strings from the 's' string which are included
    between the 'begins' and 'ends' strings.

    'ends' can also be a list or a tuple of strings: the one found
    first after 'begins' is used.  The search starts at beginindx and
    stops when 'begins' or 'ends' are no more found, or as soon as the
    list contains maxRes items (if maxRes is not None).
    Results are appended to lres, if given, and the same list is
    returned; otherwise a new list is created.

    The string is scanned iteratively, so that the number of results
    is not limited by the maximum recursion depth.
    """
    if lres is None:
        lres = []
    lbegins = len(begins)
    many_ends = isinstance(ends, (list, tuple))
    while True:
        bi = s.find(begins, beginindx)
        if bi == -1:
            break
        start = bi + lbegins
        if many_ends:
            found = [s.find(end, start) for end in ends]
            found = [x for x in found if x != -1]
            if not found:
                break
            ei = min(found)
        else:
            ei = s.find(ends, start)
            if ei == -1:
                break
        lres.append(s[start:ei])
        if maxRes is not None and len(lres) >= maxRes:
            break
        # The next search starts from the end marker just found; with
        # an empty 'begins' this may not move forward, hence the max().
        beginindx = max(ei, bi + 1)
    return lres
|
||||
|
||||
|
||||
class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
|
||||
"""The class used to access IMDb's data through the web for
|
||||
mobile terminals."""
|
||||
|
||||
accessSystem = 'mobile'
|
||||
_mobile_logger = logging.getLogger('imdbpy.parser.mobile')
|
||||
|
||||
def __init__(self, isThin=1, *arguments, **keywords):
    """Initialize the access system.

    The access system is set to 'mobile'; isThin (true by default)
    and any other argument are handed over to
    IMDbHTTPAccessSystem.__init__.
    """
    self.accessSystem = 'mobile'
    IMDbHTTPAccessSystem.__init__(self, isThin, *arguments, **keywords)
|
||||
|
||||
def _clean_html(self, html):
    """Normalize the retrieved html: squeeze the spaces, strip the
    ' »' sequences and substitute the XML references."""
    squeezed = re_spaces.sub(' ', html)
    # Remove silly » chars.
    cleaned = squeezed.replace(' »', '')
    return subXMLRefs(cleaned)
|
||||
|
||||
def _mretrieve(self, url, size=-1):
    """Retrieve the html page at the given url (passing 'size' on to
    self._retrieve) and return it normalized by self._clean_html."""
    raw_page = self._retrieve(url, size=size)
    return self._clean_html(raw_page)
|
||||
|
||||
def _getPersons(self, s, sep='<br/>'):
    """Return a list of Person objects, from the string s; items
    are assumed to be separated by the sep string.

    Items without an imdbID or without a name are skipped; the
    billing position of a person is its (1-based) position amongst
    the persons actually returned.
    """
    persons = []
    for chunk in s.split(sep):
        person_ids = re_imdbID.findall(chunk)
        if not person_ids:
            continue
        # IDs of the roles: None if there are no roles, a single
        # value for a single role, a list otherwise (None is used for
        # roles without an ID).
        role_ids = None
        char_tags = _getTagsWith(chunk, 'class="char"',
                                 toClosure=True, maxRes=1)
        if char_tags:
            role_ids = []
            for role in char_tags[0].split(' / '):
                found = re_imdbID.findall(role)
                if found:
                    role_ids.append(found[-1])
                else:
                    role_ids.append(None)
            if len(role_ids) == 1:
                role_ids = role_ids[0]
        name = _unHtml(chunk)
        # Catch unclosed tags.
        gt_indx = name.find('>')
        if gt_indx != -1:
            name = name[gt_indx+1:].lstrip()
        if not name:
            continue
        if name.endswith('...'):
            name = name[:-3]
        person = build_person(name, personID=str(person_ids[0]),
                              billingPos=len(persons) + 1,
                              modFunct=self._defModFunct, roleID=role_ids,
                              accessSystem=self.accessSystem)
        persons.append(person)
    return persons
|
||||
|
||||
def _search_movie(self, title, results):
    """Search for movies matching the given title.

    Return a list of (movieID, data) tuples, where data is the
    dictionary built by analyze_title; for the items of a results
    page, the alternative titles (if any) are stored under the
    'akas' key.  An empty list is returned if the page can't be
    parsed.
    """
    # Keep the searched string: 'title' is reused below for the title
    # of the returned page, while log messages must show the query.
    query = title
    cont = subXMLRefs(self._get_search_content('tt', query, results))
    title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not title:
        self._mobile_logger.error('no title tag searching for movie %s',
                                  query)
        return res
    tl = title[0].lower()
    if not tl.startswith('imdb title'):
        # a direct hit!
        title = _unHtml(title[0])
        mid = None
        midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if midtag:
            mid = _findBetween(midtag[0], '/title/tt', '/', maxRes=1)
        if not (mid and title):
            self._mobile_logger.error('no direct hit title/movieID for' \
                                      ' title %s', query)
            return res
        if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
            title += ' (mini)'
        res[:] = [(str(mid[0]), analyze_title(title))]
    else:
        # XXX: this results*3 prevents some recursion errors, but...
        #      it's not exactly understandable (i.e.: why 'results' is
        #      not enough to get all the results?)
        lis = _findBetween(cont, 'td valign="top">', '</td>',
                           maxRes=results*3)
        for li in lis:
            akas = []
            for aka in re_makas.findall(li):
                aka = aka.replace('" - ', '::', 1)
                aka = _unHtml(aka)
                if aka.startswith('aka "'):
                    aka = aka[5:].strip()
                # endswith, and not aka[-1], since aka may be empty.
                if aka.endswith('"'):
                    aka = aka[:-1]
                if aka:
                    akas.append(aka)
            imdbid = re_imdbID.findall(li)
            li = re_makas.sub('', li)
            mtitle = _unHtml(li)
            if not (imdbid and mtitle):
                self._mobile_logger.debug('no title/movieID parsing' \
                                          ' %s searching for title %s', li,
                                          query)
                continue
            mtitle = mtitle.replace('(TV mini-series)', '(mini)')
            resd = analyze_title(mtitle)
            if akas:
                resd['akas'] = akas
            res.append((str(imdbid[0]), resd))
    return res
|
||||
|
||||
def get_movie_main(self, movieID):
    """Retrieve and parse the main page of the movie with the given
    movieID.

    Return a dictionary in the form {'data': d}, where d collects
    the information found in the page (title, kind, director, writer,
    cast, akas, rating and so on).
    Raise IMDbDataAccessError if the page has no title tag.
    """
    cont = self._mretrieve(imdbURL_movie_main % movieID + 'maindetails')

    def _credits(begins):
        # Persons listed in the section of the page that starts with
        # 'begins' (an empty list, if the section is missing).
        found = _findBetween(cont, begins, ('</div>', '<br/> <br/>'),
                             maxRes=1)
        if not found:
            return []
        raw = found[0]
        h5idx = raw.find('/h5>')
        if h5idx != -1:
            raw = raw[h5idx+4:]
        return self._getPersons(raw)

    def _linked_values(found):
        # Values of a ' | '-separated list of links; the '<a ' prefix
        # makes sure that the first tag of every item is well-formed,
        # so that _unHtml can strip it.
        values = ['<a %s' % x for x in found[0].split(' | ') if x]
        return [_unHtml(x.replace(' <i>', '::')) for x in values]

    title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    if not title:
        raise IMDbDataAccessError('unable to get movieID "%s"' % movieID)
    title = _unHtml(title[0])
    if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
        title += ' (mini)'
    d = analyze_title(title)
    kind = d.get('kind')
    # If the page links to a TV series, this is one of its episodes.
    tv_series = _findBetween(cont, 'TV Series:</h5>', '</a>', maxRes=1)
    if tv_series:
        mid = re_imdbID.findall(tv_series[0])
    else:
        mid = None
    if tv_series and mid:
        s_title = _unHtml(tv_series[0])
        s_data = analyze_title(s_title)
        m = Movie(movieID=str(mid[0]), data=s_data,
                  accessSystem=self.accessSystem,
                  modFunct=self._defModFunct)
        d['kind'] = kind = u'episode'
        d['episode of'] = m
    if kind in ('tv series', 'tv mini series'):
        years = _findBetween(cont, '<h1>', '</h1>', maxRes=1)
        if years:
            years[:] = _findBetween(years[0], 'TV series', '</span>',
                                    maxRes=1)
            if years:
                d['series years'] = years[0].strip()
    air_date = _findBetween(cont, 'Original Air Date:</h5>', '</div>',
                            maxRes=1)
    if air_date:
        air_date = air_date[0]
        vi = air_date.find('(')
        if vi != -1:
            date = _unHtml(air_date[:vi]).strip()
            if date != '????':
                d['original air date'] = date
            air_date = air_date[vi:]
            # Season and episode are stored as integers, if possible,
            # or as non-empty strings otherwise.
            season = _findBetween(air_date, 'Season', ',', maxRes=1)
            if season:
                season = season[0].strip()
                try:
                    season = int(season)
                except ValueError:
                    pass
                if season or type(season) is _inttype:
                    d['season'] = season
            episode = _findBetween(air_date, 'Episode', ')', maxRes=1)
            if episode:
                episode = episode[0].strip()
                try:
                    episode = int(episode)
                except ValueError:
                    pass
                # Check the type of the episode itself (and not of the
                # season), so that episode number 0 is stored and an
                # empty string is not.
                if episode or type(episode) is _inttype:
                    d['episode'] = episode
    direct = _credits('<h5>Director')
    if direct:
        d['director'] = direct
    if kind in ('tv series', 'tv mini series', 'episode'):
        if kind != 'episode':
            seasons = _findBetween(cont, 'Seasons:</h5>', '</div>',
                                   maxRes=1)
            if seasons:
                d['number of seasons'] = seasons[0].count('|') + 1
        creator_ends = ('class="tn15more"', '</div>', '<br/> <br/>')
        creator = _findBetween(cont, 'Created by</h5>', creator_ends,
                               maxRes=1)
        if not creator:
            # They change 'Created by' to 'Creator' and viceversa
            # from time to time...
            # XXX: is 'Creators' also used?
            creator = _findBetween(cont, 'Creator:</h5>', creator_ends,
                                   maxRes=1)
        if creator:
            creator = creator[0]
            # NOTE(review): find() returns -1 (true) when 'tn15more'
            # is absent, so the '>' is appended unless the string
            # starts with 'tn15more'; left untouched because the
            # parsing of the section may rely on it -- confirm the
            # intended condition.
            if creator.find('tn15more'):
                creator = '%s>' % creator
            creator = self._getPersons(creator)
            if creator:
                d['creator'] = creator
    writers = _credits('<h5>Writer')
    if writers:
        d['writer'] = writers
    cvurl = _getTagsWith(cont, 'name="poster"', toClosure=True, maxRes=1)
    if cvurl:
        cvurl = _findBetween(cvurl[0], 'src="', '"', maxRes=1)
        if cvurl:
            d['cover url'] = cvurl[0]
    genres = _findBetween(cont, 'href="/Sections/Genres/', '/')
    if genres:
        d['genres'] = list(set(genres))
    ur = _findBetween(cont, '<div class="starbar-meta">', '</div>',
                      maxRes=1)
    if ur:
        rat = _findBetween(ur[0], '<b>', '</b>', maxRes=1)
        if rat:
            teni = rat[0].find('/10')
            if teni != -1:
                rat = rat[0][:teni]
                try:
                    rat = float(rat.strip())
                    d['rating'] = rat
                except ValueError:
                    self._mobile_logger.warn('wrong rating: %s', rat)
        vi = ur[0].rfind('tn15more">')
        if vi != -1 and ur[0][vi+10:].find('await') == -1:
            try:
                votes = _unHtml(ur[0][vi+10:]).replace('votes', '').strip()
                votes = int(votes.replace(',', ''))
                d['votes'] = votes
            except ValueError:
                self._mobile_logger.warn('wrong votes: %s', ur)
    top250 = _findBetween(cont, 'href="/chart/top?', '</a>', maxRes=1)
    if top250:
        fn = top250[0].rfind('#')
        if fn != -1:
            try:
                td = int(top250[0][fn+1:])
                d['top 250 rank'] = td
            except ValueError:
                self._mobile_logger.warn('wrong top250: %s', top250)
    # The cast table is introduced by one of these headers.
    castdata = []
    for cast_header in ('Cast overview', 'Credited cast',
                        'Complete credited cast', 'Series Cast Summary',
                        'Episode Credited cast'):
        castdata = _findBetween(cont, cast_header, '</table>', maxRes=1)
        if castdata:
            break
    if castdata:
        castdata = castdata[0]
        # Reintegrate the first tag.
        fl = castdata.find('href=')
        if fl != -1:
            castdata = '<a ' + castdata[fl:]
        # Exclude the 'rest of cast listed alphabetically' row.
        smib = castdata.find('<tr><td align="center" colspan="4"><small>')
        if smib != -1:
            smie = castdata.rfind('</small></td></tr>')
            if smie != -1:
                castdata = castdata[:smib].strip() + \
                           castdata[smie+18:].strip()
        castdata = castdata.replace('/tr> <tr', '/tr><tr')
        cast = self._getPersons(castdata, sep='</tr><tr')
        if cast:
            d['cast'] = cast
    akas = _findBetween(cont, 'Also Known As:</h5>', '</div>', maxRes=1)
    if akas:
        # For some reason, here <br> is still used in place of <br/>.
        akas[:] = [x for x in akas[0].split('<br>') if x.strip()]
        akas = [_unHtml(x).replace('" - ','::', 1).lstrip('"').strip()
                for x in akas]
        if 'See more' in akas:
            akas.remove('See more')
        akas[:] = [x for x in akas if x]
        if akas:
            d['akas'] = akas
    mpaa = _findBetween(cont, 'MPAA</a>:', '</div>', maxRes=1)
    if mpaa:
        d['mpaa'] = _unHtml(mpaa[0])
    runtimes = _findBetween(cont, 'Runtime:</h5>', '</div>', maxRes=1)
    if runtimes:
        runtimes = runtimes[0]
        runtimes = [x.strip().replace(' min', '').replace(' (', '::(', 1)
                    for x in runtimes.split('|')]
        d['runtimes'] = [_unHtml(x).strip() for x in runtimes]
    if kind == 'episode':
        # number of episodes.
        epsn = _findBetween(cont, 'title="Full Episode List">', '</a>',
                            maxRes=1)
        if epsn:
            epsn = epsn[0].replace(' Episodes', '').strip()
            if epsn:
                try:
                    epsn = int(epsn)
                except ValueError:
                    self._mobile_logger.warn('wrong episodes #: %s', epsn)
                # Stored as a string, if it's not a number.
                d['number of episodes'] = epsn
    country = _findBetween(cont, 'Country:</h5>', '</div>', maxRes=1)
    if country:
        country = _linked_values(country)
        if country:
            d['countries'] = country
    lang = _findBetween(cont, 'Language:</h5>', '</div>', maxRes=1)
    if lang:
        lang = _linked_values(lang)
        if lang:
            d['languages'] = lang
    # Only the first match is used, so there's no need to collect
    # every occurrence.
    col = _findBetween(cont, '"/search/title?colors=', '</div>', maxRes=1)
    if col:
        col = _linked_values(col)
        if col:
            d['color info'] = col
    sm = _findBetween(cont, '/search/title?sound_mixes=', '</div>',
                      maxRes=1)
    if sm:
        sm = _linked_values(sm)
        if sm:
            d['sound mix'] = sm
    cert = _findBetween(cont, 'Certification:</h5>', '</div>', maxRes=1)
    if cert:
        cert = [_unHtml(x.replace(' <i>', '::'))
                for x in cert[0].split(' | ')]
        if cert:
            d['certificates'] = cert
    plotoutline = _findBetween(cont, 'Plot:</h5>', ['<a ', '</div>'],
                               maxRes=1)
    if plotoutline:
        plotoutline = plotoutline[0].strip()
        plotoutline = plotoutline.rstrip('|').rstrip()
        if plotoutline:
            d['plot outline'] = _unHtml(plotoutline)
    aratio = _findBetween(cont, 'Aspect Ratio:</h5>', ['<a ', '</div>'],
                          maxRes=1)
    if aratio:
        aratio = aratio[0].strip().replace(' (', '::(', 1)
        if aratio:
            d['aspect ratio'] = _unHtml(aratio)
    return {'data': d}
|
||||
|
||||
def get_movie_plot(self, movieID):
    """Retrieve the plot summaries of the movie with the given movieID.

    Return {'data': {'plot': [...]}}, or {'data': {}} if no plot is
    found.  When a summary ends with a ' Written by ' note, the note
    is turned into a '::author' suffix, where curly brackets are
    replaced by angle brackets.
    """
    cont = self._mretrieve(imdbURL_movie_main % movieID + 'plotsummary')
    paragraphs = _findBetween(cont, '<p class="plotpar">', '</p>')
    plot = [_unHtml(paragraph) for paragraph in paragraphs]
    for idx, text in enumerate(plot):
        wbyidx = text.rfind(' Written by ')
        if wbyidx == -1:
            continue
        summary = text[:wbyidx].rstrip()
        # 12 is the length of ' Written by '.
        author = text[wbyidx+12:].rstrip()
        author = author.replace('{','<').replace('}','>')
        plot[idx] = '%s::%s' % (summary, author)
    if not plot:
        return {'data': {}}
    return {'data': {'plot': plot}}
|
||||
|
||||
def _search_person(self, name, results):
    """Search for persons matching the given name.

    Return a list of (personID, data) tuples, where data is the
    dictionary built by analyze_name; for the items of a results
    page, the alternative names (if any) are stored under the
    'akas' key.  An empty list is returned if the page can't be
    parsed.
    """
    # Keep the searched string: 'name' is reused below for the title
    # of the returned page, while log messages must show the query.
    query = name
    cont = subXMLRefs(self._get_search_content('nm', query, results))
    name = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not name:
        self._mobile_logger.warn('no title tag searching for name %s',
                                 query)
        return res
    nl = name[0].lower()
    if not nl.startswith('imdb name'):
        # a direct hit!
        name = _unHtml(name[0])
        name = name.replace('- Filmography by type' , '').strip()
        pid = None
        pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if pidtag:
            pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
        if not (pid and name):
            self._mobile_logger.error('no direct hit name/personID for' \
                                      ' name %s', query)
            return res
        res[:] = [(str(pid[0]), analyze_name(name, canonical=1))]
    else:
        lis = _findBetween(cont, 'td valign="top">', '</td>',
                           maxRes=results*3)
        for li in lis:
            akas = _findBetween(li, '<em>"', '"</em>')
            # Cut everything that follows the name.
            for sep in ['<small', '<br> aka', '<br> birth name']:
                sepIdx = li.find(sep)
                if sepIdx != -1:
                    li = li[:sepIdx]
            pid = re_imdbID.findall(li)
            pname = _unHtml(li)
            if not (pid and pname):
                self._mobile_logger.debug('no name/personID parsing' \
                                          ' %s searching for name %s', li,
                                          query)
                continue
            resd = analyze_name(pname, canonical=1)
            if akas:
                resd['akas'] = akas
            res.append((str(pid[0]), resd))
    return res
|
||||
|
||||
def get_person_main(self, personID, _parseChr=False):
    """Scrape the main page of a person (or character) into a dictionary.

    personID is interpolated into the person URL template, or into the
    character URL template when _parseChr is true.

    Returns {'data': r, 'info sets': ('main', 'filmography')}, where r
    starts as the result of analyze_name() on the page title and is then
    filled with 'birth date'/'birth notes', 'death date'/'death notes',
    'akas', 'headshot' and one list of movies per filmography section.

    Raises IMDbDataAccessError when the page has no <title> tag.
    """
    if not _parseChr:
        url = imdbURL_person_main % personID + 'maindetails'
    else:
        url = imdbURL_character_main % personID
    s = self._mretrieve(url)
    # NOTE(review): this empty dict is replaced by analyze_name() below
    # before it is ever read.
    r = {}
    name = _findBetween(s, '<title>', '</title>', maxRes=1)
    if not name:
        if _parseChr: w = 'characterID'
        else: w = 'personID'
        raise IMDbDataAccessError, 'unable to get %s "%s"' % (w, personID)
    # Strip the site decorations from the title to obtain the bare name.
    name = _unHtml(name[0].replace(' - IMDb', ''))
    if _parseChr:
        name = name.replace('(Character)', '').strip()
        name = name.replace('- Filmography by type', '').strip()
    else:
        name = name.replace('- Filmography by', '').strip()
    r = analyze_name(name, canonical=not _parseChr)
    # Birth and death: the text after "<kind>:</h4>" is split on the first
    # ' in ' into a date part and a notes part.
    for dKind in ('Born', 'Died'):
        date = _findBetween(s, '%s:</h4>' % dKind.capitalize(),
                            ('<div class', '</div>', '<br/><br/>'), maxRes=1)
        if date:
            date = _unHtml(date[0])
            if date:
                #date, notes = date_and_notes(date)
                # TODO: fix to handle real names.
                date_notes = date.split(' in ', 1)
                notes = u''
                date = date_notes[0]
                if len(date_notes) == 2:
                    notes = date_notes[1]
                dtitle = 'birth'
                if dKind == 'Died':
                    dtitle = 'death'
                if date:
                    r['%s date' % dtitle] = date
                if notes:
                    r['%s notes' % dtitle] = notes
    # Alternate names are separated by ' | ' or, failing that, by ' / '.
    akas = _findBetween(s, 'Alternate Names:</h5>', ('</div>',
                        '<br/><br/>'), maxRes=1)
    if akas:
        akas = akas[0]
        if akas.find(' | ') != -1:
            akas = _unHtml(akas).split(' | ')
        else:
            akas = _unHtml(akas).split(' / ')
        if akas: r['akas'] = akas
    # Headshot: the src attribute found inside the name="headshot" anchor.
    hs = _findBetween(s, 'name="headshot"', '</a>', maxRes=1)
    if hs:
        hs[:] = _findBetween(hs[0], 'src="', '"', maxRes=1)
        if hs: r['headshot'] = hs[0]
    # Build a list of tuples such [('hrefLink', 'section name')]
    workkind = _findBetween(s, 'id="jumpto_', '</a>')
    ws = []
    for work in workkind:
        # The anchor id and its label are separated by '">' or '" >'.
        sep = '" >'
        if '">' in work:
            sep = '">'
        wsplit = work.split(sep, 1)
        if len(wsplit) == 2:
            sect = wsplit[0]
            if '"' in sect:
                sect = sect[:sect.find('"')]
            ws.append((sect, wsplit[1].lower()))
    # XXX: I think "guest appearances" are gone.
    if s.find('<a href="#guest-appearances"') != -1:
        ws.append(('guest-appearances', 'notable tv guest appearances'))
    #if _parseChr:
    #    ws.append(('filmography', 'filmography'))
    for sect, sectName in ws:
        raws = u''
        # Everything between the current section link and the end
        # of the <ol> tag.
        if _parseChr and sect == 'filmography':
            inisect = s.find('<div class="filmo">')
        else:
            inisect = s.find('<a name="%s' % sect)
        if inisect != -1:
            endsect = s[inisect:].find('<div id="filmo-head-')
            if endsect != -1: raws = s[inisect:inisect+endsect]
        if not raws: continue
        mlist = _findBetween(raws, '<div class="filmo-row',
                             ('<div class="clear"/>',))
        for m in mlist:
            # Drop the remainder of the opening <div ...> tag.
            fCB = m.find('>')
            if fCB != -1:
                m = m[fCB+1:].lstrip()
            m = re_filmo_episodes.sub('', m)
            # For every movie in the current section.
            movieID = re_imdbID.findall(m)
            if not movieID:
                self._mobile_logger.debug('no movieID in %s', m)
                continue
            m = m.replace('<br/>', ' .... ', 1)
            if not _parseChr:
                chrIndx = m.find(' .... ')
            else:
                chrIndx = m.find(' Played by ')
            chids = []
            if chrIndx != -1:
                # Skip the 6 characters of ' .... '; ' Played by ' is
                # 5 characters longer, hence the extra slice.
                chrtxt = m[chrIndx+6:]
                if _parseChr:
                    chrtxt = chrtxt[5:]
                for ch in chrtxt.split(' / '):
                    chid = re_imdbID.findall(ch)
                    if not chid:
                        chids.append(None)
                    else:
                        chids.append(chid[-1])
            # Collapse the role IDs: None when empty, a scalar when single.
            if not chids:
                chids = None
            elif len(chids) == 1:
                chids = chids[0]
            movieID = str(movieID[0])
            # Search the status.
            stidx = m.find('<i>')
            status = u''
            if stidx != -1:
                stendidx = m.rfind('</i>')
                if stendidx != -1:
                    status = _unHtml(m[stidx+3:stendidx])
                    # Remove the status text so it does not end up in
                    # the title.
                    m = m.replace(m[stidx+3:stendidx], '')
            year = _findBetween(m, 'year_column">', '</span>', maxRes=1)
            if year:
                year = year[0]
                m = m.replace('<span class="year_column">%s</span>' % year,
                              '')
            else:
                year = None
            m = _unHtml(m)
            if not m:
                self._mobile_logger.warn('no title for movieID %s', movieID)
                continue
            movie = build_movie(m, movieID=movieID, status=status,
                                roleID=chids, modFunct=self._defModFunct,
                                accessSystem=self.accessSystem,
                                _parsingCharacter=_parseChr, year=year)
            # Only the part of the section label before ':' is the key.
            sectName = sectName.split(':')[0]
            r.setdefault(sectName, []).append(movie)
    # If available, take the always correct name from a form.
    itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
    if not itag:
        itag = _getTagsWith(s, 'name="primary"', maxRes=1)
    if itag:
        vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
        if not vtag:
            vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
        if vtag:
            try:
                vtag = unquote(str(vtag[0]))
                vtag = unicode(vtag, 'latin_1')
                r.update(analyze_name(vtag))
            except UnicodeEncodeError:
                # Best effort: keep the name parsed from the title.
                pass
    return {'data': r, 'info sets': ('main', 'filmography')}
|
||||
|
||||
def get_person_biography(self, personID):
    """Scrape the 'bio' page of a person into a dictionary.

    Returns {'data': d}.  d may contain 'spouse' and 'nick names' lists,
    'birth date'/'birth notes', 'death date'/'death notes', and one entry
    per remaining <h5> section of the page, keyed by the lower-cased
    section title (a few titles are renamed, see below).
    """
    cont = self._mretrieve(imdbURL_person_main % personID + 'bio')
    d = {}
    spouses = _findBetween(cont, 'Spouse</h5>', ('</table>', '</dd>'),
                           maxRes=1)
    if spouses:
        sl = []
        for spouse in spouses[0].split('</tr>'):
            # Mark the boundary between the first two cells with '::'.
            if spouse.count('</td>') > 1:
                spouse = spouse.replace('</td>', '::</td>', 1)
            spouse = _unHtml(spouse)
            spouse = spouse.replace(':: ', '::').strip()
            if spouse: sl.append(spouse)
        if sl: d['spouse'] = sl
    nnames = _findBetween(cont, '<h5>Nickname</h5>', ('<br/> <br/>','<h5>'),
                          maxRes=1)
    if nnames:
        nnames = nnames[0]
        if nnames:
            nnames = [x.strip().replace(' (', '::(', 1)
                      for x in nnames.split('<br/>')]
            if nnames:
                d['nick names'] = nnames
    # Every other section is a "<h5>title</h5>data" chunk.
    misc_sects = _findBetween(cont, '<h5>', '<br/>')
    misc_sects[:] = [x.split('</h5>') for x in misc_sects]
    misc_sects[:] = [x for x in misc_sects if len(x) == 2]
    for sect, data in misc_sects:
        sect = sect.lower().replace(':', '').strip()
        # Keep the first occurrence of a section; only mini biographies
        # are allowed to accumulate.
        if sect in d and sect != 'mini biography': continue
        elif sect in ('spouse', 'nickname'): continue
        if sect == 'salary': sect = 'salary history'
        elif sect == 'where are they now': sect = 'where now'
        elif sect == 'personal quotes': sect = 'quotes'
        data = data.replace('</p><p>', '::')
        data = data.replace('<br><br>', ' ') # for multi-paragraphs 'bio'
        data = data.replace('</td> <td valign="top">', '@@@@')
        data = data.replace('</td> </tr>', '::')
        data = _unHtml(data)
        data = [x.strip() for x in data.split('::')]
        data[:] = [x.replace('@@@@', '::') for x in data if x]
        if sect == 'height' and data: data = data[0]
        elif sect == 'birth name':
            # An empty section used to raise IndexError on data[0].
            if not data: continue
            data = canonicalName(data[0])
        elif sect == 'date of birth':
            if not data: continue
            date, notes = date_and_notes(data[0])
            if date:
                d['birth date'] = date
            if notes:
                d['birth notes'] = notes
            continue
        elif sect == 'date of death':
            if not data: continue
            date, notes = date_and_notes(data[0])
            if date:
                d['death date'] = date
            if notes:
                d['death notes'] = notes
            continue
        elif sect == 'mini biography':
            ndata = []
            for bio in data:
                byidx = bio.rfind('IMDb Mini Biography By')
                if byidx != -1:
                    bioAuth = bio[:byidx].rstrip()
                    # Skip the 22-character marker plus the character
                    # that follows it.
                    bioRest = bio[byidx+23:].lstrip()
                else:
                    # No marker: keep the whole text.  Slicing from
                    # byidx+23 here would be bio[22:] and silently drop
                    # the first 22 characters.
                    bioAuth = 'Anonymous'
                    bioRest = bio
                bio = u'%s::%s' % (bioAuth, bioRest)
                ndata.append(bio)
            data[:] = ndata
            if 'mini biography' in d:
                # Guard against an empty section (ndata[0] would fail).
                if ndata:
                    d['mini biography'].append(ndata[0])
                continue
        d[sect] = data
    return {'data': d}
|
||||
|
||||
def _search_character(self, name, results):
    """Search for a character and return a list of (characterID, data).

    name is the text to search for and results the wanted number of
    hits.  Each item of the returned list is a tuple made of the ID
    (as a string) and the dictionary returned by analyze_name().
    """
    cont = subXMLRefs(self._get_search_content('char', name, results))
    # The page title is kept in its own variable: reusing 'name' for it
    # made every log message below report the title (or an empty list)
    # instead of the searched character.
    title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not title:
        self._mobile_logger.error('no title tag searching character %s',
                                  name)
        return res
    nl = title[0].lower()
    # NOTE(review): the first two prefixes are identical as written here;
    # confirm against upstream whether one of them originally differed.
    if not (nl.startswith('imdb search') or nl.startswith('imdb search') \
            or nl.startswith('imdb character')):
        # a direct hit!
        chname = _unHtml(title[0]).replace('(Character)', '').strip()
        pid = None
        pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if pidtag:
            pid = _findBetween(pidtag[0], '/character/ch', '/', maxRes=1)
        if not (pid and chname):
            self._mobile_logger.error('no direct hit name/characterID for' \
                                      ' character %s', name)
            return res
        res[:] = [(str(pid[0]), analyze_name(chname))]
    else:
        # A result list: collect the links of both result tables.
        sects = _findBetween(cont, '<b>Popular Characters</b>', '</table>',
                             maxRes=results*3)
        sects += _findBetween(cont, '<b>Characters', '</table>',
                              maxRes=results*3)
        for sect in sects:
            lis = _findBetween(sect, '<a href="/character/',
                               ['<small', '</td>', '<br'])
            for li in lis:
                # Restore an opening '<' so the fragment parses as a tag.
                li = '<%s' % li
                pid = re_imdbID.findall(li)
                pname = _unHtml(li)
                if not (pid and pname):
                    self._mobile_logger.debug('no name/characterID' \
                                              ' parsing %s searching for' \
                                              ' character %s', li, name)
                    continue
                res.append((str(pid[0]), analyze_name(pname)))
    return res
|
||||
|
||||
def get_character_main(self, characterID):
    """Return the main data of a character.

    Delegates to get_person_main() with _parseChr=True and returns its
    result unchanged.
    """
    return self.get_person_main(characterID, _parseChr=True)
|
||||
|
||||
def get_character_biography(self, characterID):
    """Scrape the 'bio' page of a character.

    Returns {'data': info}; info may hold an 'introduction' string and a
    'biography' list with one 'title: text' entry per <h4> section.
    """
    page = self._mretrieve(imdbURL_character_main % characterID + 'bio')
    info = {}
    displayTag = '<div class="display">'
    opening = _findBetween(page, displayTag, ('<span>', '<h4>'), maxRes=1)
    if opening:
        text = _unHtml(opening[0]).strip()
        if text:
            info['introduction'] = text
    chunks = _findBetween(page, displayTag, '<div class="history">')
    if chunks:
        chunks = _findBetween(chunks[0], '<h4>', ('<h4>', '</div>'))
    for chunk in chunks or ():
        # '::' temporarily marks the end of the section title.
        chunk = chunk.replace('</h4>', '::').replace('\n', ' ')
        chunk = chunk.replace('<br>', '\n').replace('<br/>', '\n')
        chunk = subSGMLRefs(re_unhtmlsub('', chunk).strip())
        chunk = chunk.replace(' ::', '::').replace(':: ', '::')
        chunk = chunk.replace('::', ': ', 1)
        if chunk:
            info.setdefault('biography', []).append(chunk)
    return {'data': info}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,508 +0,0 @@
|
||||
"""
|
||||
parser.sql.alchemyadapter module (imdb.parser.sql package).
|
||||
|
||||
This module adapts the SQLAlchemy ORM to the internal mechanism.
|
||||
|
||||
Copyright 2008-2010 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import logging
|
||||
from sqlalchemy import *
|
||||
from sqlalchemy import schema
|
||||
try: from sqlalchemy import exc # 0.5
|
||||
except ImportError: from sqlalchemy import exceptions as exc # 0.4
|
||||
|
||||
_alchemy_logger = logging.getLogger('imdbpy.parser.sql.alchemy')
|
||||
|
||||
try:
|
||||
import migrate.changeset
|
||||
HAS_MC = True
|
||||
except ImportError:
|
||||
HAS_MC = False
|
||||
_alchemy_logger.warn('Unable to import migrate.changeset: Foreign ' \
|
||||
'Keys will not be created.')
|
||||
|
||||
from imdb._exceptions import IMDbDataAccessError
|
||||
from dbschema import *
|
||||
|
||||
# Used to convert table and column names.
|
||||
re_upper = re.compile(r'([A-Z])')
|
||||
|
||||
# XXX: I'm not sure at all that this is the best method to connect
|
||||
# to the database and bind that connection to every table.
|
||||
metadata = MetaData()
|
||||
|
||||
# Maps our placeholders to SQLAlchemy's column types.
|
||||
MAP_COLS = {
|
||||
INTCOL: Integer,
|
||||
UNICODECOL: UnicodeText,
|
||||
STRINGCOL: String
|
||||
}
|
||||
|
||||
|
||||
class NotFoundError(IMDbDataAccessError):
    """Raised when Table.get(id) finds no row for the requested ID."""
|
||||
|
||||
|
||||
def _renameTable(tname):
    """Convert a CamelCase table name to the SQLObject under_score form."""
    # Put '_' before every capital and lower-case the result; lower() does
    # not touch '_', so a leading one can be dropped afterwards.
    converted = re_upper.sub(r'_\1', tname).lower()
    if converted[:1] == '_':
        converted = converted[1:]
    return converted
|
||||
|
||||
def _renameColumn(cname):
    """Convert a column name to the SQLObject under_score form.

    'ID' is first turned into 'Id' so that it yields '_id' rather than
    '_i_d'.
    """
    return _renameTable(cname.replace('ID', 'Id'))
|
||||
|
||||
|
||||
class DNNameObj(object):
    """Tiny holder exposing a column's database name as .dbName, as used
    by table.sqlmeta.columns[column].dbName."""

    def __init__(self, dbName):
        self.dbName = dbName

    def __repr__(self):
        details = (self.dbName, id(self))
        return '<DNNameObj(dbName=%s) [id=%s]>' % details
|
||||
|
||||
|
||||
class DNNameDict(object):
    """Mapping-like view over a column map, standing in for
    table.sqlmeta.columns: indexing yields a DNNameObj."""

    def __init__(self, colMap):
        self.colMap = colMap

    def __getitem__(self, key):
        return DNNameObj(self.colMap[key])

    def __repr__(self):
        details = (self.colMap, id(self))
        return '<DNNameDict(colMap=%s) [id=%s]>' % details
|
||||
|
||||
|
||||
class SQLMetaAdapter(object):
    """Used to access table.sqlmeta (an object with .table, .columns and
    .idName attributes)."""

    def __init__(self, table, colMap=None):
        self.table = table
        if colMap is None:
            colMap = {}
        self.colMap = colMap

    def __getattr__(self, name):
        """Resolve the emulated attributes; unknown names yield None.

        This hook only runs when normal lookup fails, so 'table' and
        'colMap' land here only on an instance that was never
        initialised (for example one being rebuilt by copy or pickle).
        Reading self.table or self.colMap from here would then recurse
        without end, so AttributeError is raised instead.  The same is
        done for special (dunder) names: answering None for lookups
        such as __setstate__ makes the copy machinery try to call None.
        """
        if name in ('table', 'colMap') or \
                (name.startswith('__') and name.endswith('__')):
            raise AttributeError(name)
        if name == 'columns':
            return DNNameDict(self.colMap)
        if name == 'idName':
            return self.colMap.get('id', 'id')
        # Any other attribute is deliberately reported as None.
        return None

    def __repr__(self):
        return '<SQLMetaAdapter(table=%s, colMap=%s) [id=%s]>' % \
                (repr(self.table), repr(self.colMap), id(self))
|
||||
|
||||
|
||||
class QAdapter(object):
    """Used to access table.q attribute (remapped to SQLAlchemy table.c)."""

    def __init__(self, table, colMap=None):
        self.table = table
        if colMap is None:
            colMap = {}
        self.colMap = colMap

    def __getattr__(self, name):
        """Return the column of table.c that colMap maps name to.

        Raises AttributeError for names missing from colMap.
        """
        # Read the state through __dict__: going through self.colMap on
        # an instance that is not initialised yet (copy, pickle) would
        # re-enter this hook forever.
        state = self.__dict__
        try:
            return getattr(state['table'].c, state['colMap'][name])
        except KeyError:
            raise AttributeError("unable to get '%s'" % name)

    def __repr__(self):
        return '<QAdapter(table=%s, colMap=%s) [id=%s]>' % \
                (repr(self.table), repr(self.colMap), id(self))
|
||||
|
||||
|
||||
class RowAdapter(object):
    """Adapter for a SQLAlchemy RowProxy object."""

    def __init__(self, row, table, colMap=None):
        self.row = row
        # FIXME: it's OBSCENE that 'table' should be passed from
        #        TableAdapter through ResultAdapter only to land here,
        #        where it's used to directly update a row item.
        self.table = table
        if colMap is None:
            colMap = {}
        self.colMap = colMap
        self.colMapKeys = colMap.keys()

    def __getattr__(self, name):
        """Return the value of the row column that colMap maps name to.

        Raises AttributeError for names missing from colMap.
        """
        # Read the state through __dict__: going through self.colMap on
        # an instance that is not initialised yet (copy, pickle) would
        # re-enter this hook forever.
        state = self.__dict__
        try:
            return getattr(state['row'], state['colMap'][name])
        except KeyError:
            raise AttributeError("unable to get '%s'" % name)

    def __setattr__(self, name, value):
        """Write mapped columns to the database, other names to self."""
        # FIXME: I can't even think about how much performances suffer,
        #        for this horrible hack (and it's used so rarely...)
        #        For sure something like a "property" to map column names
        #        to getter/setter functions would be much better, but it's
        #        not possible (or at least not easy) to build them for a
        #        single instance.
        if name in self.__dict__.get('colMapKeys', ()):
            # Trying to update a value in the database.
            row = self.__dict__['row']
            table = self.__dict__['table']
            colMap = self.__dict__['colMap']
            params = {colMap[name]: value}
            table.update(table.c.id==row.id).execute(**params)
            # XXX: minor bug: after a value is assigned with the
            #      'rowAdapterInstance.colName = value' syntax, for some
            #      reason rowAdapterInstance.colName still returns the
            #      previous value (even if the database is updated).
            #      Fix it?  I'm not even sure it's ever used.
            return
        # For every other attribute.
        object.__setattr__(self, name, value)

    def __repr__(self):
        return '<RowAdapter(row=%s, table=%s, colMap=%s) [id=%s]>' % \
                (repr(self.row), repr(self.table), repr(self.colMap), id(self))
|
||||
|
||||
|
||||
class ResultAdapter(object):
    """Wraps a SQLAlchemy ResultProxy so that its rows come out as
    RowAdapter objects."""

    def __init__(self, result, table, colMap=None):
        self.result = result
        self.table = table
        if colMap is None:
            colMap = {}
        self.colMap = colMap

    def _adapt(self, row):
        # Wrap a single raw row.
        return RowAdapter(row, self.table, colMap=self.colMap)

    def count(self):
        return len(self)

    def __len__(self):
        # FIXME: why sqlite returns -1?  (that's wrooong!)
        rowcount = self.result.rowcount
        if rowcount == -1:
            return 0
        return rowcount

    def __getitem__(self, key):
        picked = list(self.result)[key]
        if isinstance(key, slice):
            # A (possible empty) list of items.
            return [self._adapt(row) for row in picked]
        # A single item.
        return self._adapt(picked)

    def __iter__(self):
        for row in self.result:
            yield self._adapt(row)

    def __repr__(self):
        details = (repr(self.result), repr(self.table),
                   repr(self.colMap), id(self))
        return '<ResultAdapter(result=%s, table=%s, colMap=%s) [id=%s]>' % \
                details
|
||||
|
||||
|
||||
class TableAdapter(object):
    """Adapter for a SQLAlchemy Table object, to mimic a SQLObject class."""

    def __init__(self, table, uri=None):
        """Initialize a TableAdapter object.

        table is a schema description exposing .name and .cols; uri is
        the optional connection URI, used only to tune column types.
        """
        self._imdbpySchema = table
        self._imdbpyName = table.name
        self.connectionURI = uri
        self.colMap = {}
        columns = []
        for col in table.cols:
            # Column's parameters.
            params = {'nullable': True}
            params.update(col.params)
            if col.name == 'id':
                params['primary_key'] = True
            if 'notNone' in params:
                params['nullable'] = not params['notNone']
                del params['notNone']
            cname = _renameColumn(col.name)
            self.colMap[col.name] = cname
            colClass = MAP_COLS[col.kind]
            colKindParams = {}
            if 'length' in params:
                colKindParams['length'] = params['length']
                del params['length']
            elif colClass is UnicodeText and col.index:
                # XXX: limit length for UNICODECOLs that will have an index.
                #      this can result in name.name and title.title truncations!
                colClass = Unicode
                # Should work for most of the database servers.
                length = 511
                if self.connectionURI:
                    if self.connectionURI.startswith('mysql'):
                        # To stay compatible with MySQL 4.x.
                        length = 255
                colKindParams['length'] = length
            elif self._imdbpyName == 'PersonInfo' and col.name == 'info':
                if self.connectionURI:
                    if self.connectionURI.startswith('ibm'):
                        # There are some entries longer than 32KB.
                        colClass = CLOB
                        # I really do hope that this space isn't wasted
                        # for each other shorter entry... <g>
                        colKindParams['length'] = 68*1024
            colKind = colClass(**colKindParams)
            if 'alternateID' in params:
                # There's no need to handle them here.
                del params['alternateID']
            # Create a column.
            colObj = Column(cname, colKind, **params)
            columns.append(colObj)
        self.tableName = _renameTable(table.name)
        # Create the table.
        self.table = Table(self.tableName, metadata, *columns)
        self._ta_insert = self.table.insert()
        self._ta_select = self.table.select
        # Adapters for special attributes.
        self.q = QAdapter(self.table, colMap=self.colMap)
        self.sqlmeta = SQLMetaAdapter(self.table, colMap=self.colMap)

    def select(self, conditions=None):
        """Return a list of results."""
        result = self._ta_select(conditions).execute()
        return ResultAdapter(result, self.table, colMap=self.colMap)

    def get(self, theID):
        """Get an object given its ID.

        Raises NotFoundError when no row has that ID.
        """
        result = self.select(self.table.c.id == theID)
        #if not result:
        #    raise NotFoundError, 'no data for ID %s' % theID
        # FIXME: isn't this a bit risky?  We can't check len(result),
        #        because sqlite returns -1...
        #        What about converting it to a list and getting the first item?
        try:
            return result[0]
        except (KeyError, IndexError):
            # ResultAdapter.__getitem__ indexes a plain list, so an empty
            # result raises IndexError; catching only KeyError let that
            # escape instead of becoming NotFoundError.
            raise NotFoundError('no data for ID %s' % theID)

    def dropTable(self, checkfirst=True):
        """Drop the table."""
        dropParams = {'checkfirst': checkfirst}
        # Guess what?  Another work-around for a ibm_db bug.
        if self.table.bind.engine.url.drivername.startswith('ibm_db'):
            del dropParams['checkfirst']
        try:
            self.table.drop(**dropParams)
        except exc.ProgrammingError:
            # As above: re-raise the exception, but only if it's not ibm_db.
            if not self.table.bind.engine.url.drivername.startswith('ibm_db'):
                raise

    def createTable(self, checkfirst=True):
        """Create the table."""
        self.table.create(checkfirst=checkfirst)
        # Create indexes for alternateID columns (other indexes will be
        # created later, at explicit request for performances reasons).
        for col in self._imdbpySchema.cols:
            if col.name == 'id':
                continue
            if col.params.get('alternateID', False):
                self._createIndex(col, checkfirst=checkfirst)

    def _createIndex(self, col, checkfirst=True):
        """Create an index for a given (schema) column."""
        # XXX: indexLen is ignored in SQLAlchemy, and that means that
        #      indexes will be over the whole 255 chars strings...
        # NOTE: don't use a dot as a separator, or DB2 will do
        #       nasty things.
        idx_name = '%s_%s' % (self.table.name, col.index or col.name)
        if checkfirst:
            for index in self.table.indexes:
                if index.name == idx_name:
                    return
        idx = Index(idx_name, getattr(self.table.c, self.colMap[col.name]))
        # XXX: beware that exc.OperationalError can be raised, is some
        #      strange circumstances; that's why the index name doesn't
        #      follow the SQLObject convention, but includes the table name:
        #      sqlite, for example, expects index names to be unique at
        #      db-level.
        try:
            idx.create()
        except exc.OperationalError:
            # sys.exc_info() keeps this valid on every Python version,
            # unlike the 'except X, e' / 'except X as e' spellings.
            e = sys.exc_info()[1]
            _alchemy_logger.warning('Skipping creation of the %s.%s index: %s',
                                    self.sqlmeta.table, col.name, e)

    def addIndexes(self, ifNotExists=True):
        """Create all required indexes."""
        for col in self._imdbpySchema.cols:
            if col.index:
                self._createIndex(col, checkfirst=ifNotExists)

    def addForeignKeys(self, mapTables, ifNotExists=True):
        """Create all required foreign keys.

        mapTables maps schema table names to their TableAdapter.  Nothing
        is done when migrate.changeset could not be imported.
        """
        if not HAS_MC:
            return
        # It seems that there's no reason to prevent the creation of
        # indexes for columns with FK constrains: if there's already
        # an index, the FK index is not created.
        countCols = 0
        for col in self._imdbpySchema.cols:
            countCols += 1
            if not col.foreignKey:
                continue
            # The target is 'Table' or 'Table.column'; default column: id.
            fks = col.foreignKey.split('.', 1)
            foreignTableName = fks[0]
            if len(fks) == 2:
                foreignColName = fks[1]
            else:
                foreignColName = 'id'
            foreignColName = mapTables[foreignTableName].colMap.get(
                                                foreignColName, foreignColName)
            thisColName = self.colMap.get(col.name, col.name)
            thisCol = self.table.columns[thisColName]
            foreignTable = mapTables[foreignTableName].table
            foreignCol = getattr(foreignTable.c, foreignColName)
            # Need to explicitly set an unique name, otherwise it will
            # explode, if two cols points to the same table.
            fkName = 'fk_%s_%s_%d' % (foreignTable.name, foreignColName,
                                      countCols)
            constrain = migrate.changeset.ForeignKeyConstraint([thisCol],
                                                               [foreignCol],
                                                               name=fkName)
            try:
                constrain.create()
            except exc.OperationalError:
                continue

    def __call__(self, *args, **kwds):
        """To insert a new row with the syntax: TableClass(key=value, ...)"""
        taArgs = {}
        for key, value in kwds.items():
            taArgs[self.colMap.get(key, key)] = value
        self._ta_insert.execute(*args, **taArgs)

    def __repr__(self):
        return '<TableAdapter(table=%s) [id=%s]>' % (repr(self.table), id(self))
|
||||
|
||||
|
||||
# Module-level "cache" for SQLObject classes, to prevent
|
||||
# "Table 'tableName' is already defined for this MetaData instance" errors,
|
||||
# when two or more connections to the database are made.
|
||||
# XXX: is this the best way to act?
|
||||
TABLES_REPOSITORY = {}
|
||||
|
||||
def getDBTables(uri=None):
    """Build the list of TableAdapter objects for every table of the
    schema, to be used to access the database through SQLAlchemy.

    Adapters are created once and then reused from TABLES_REPOSITORY.
    The optional connection uri is handed to newly created adapters, to
    tailor the db schema to specific needs."""
    adapters = []
    for schemaTable in DB_SCHEMA:
        key = schemaTable.name
        try:
            adapter = TABLES_REPOSITORY[key]
        except KeyError:
            adapter = TableAdapter(schemaTable, uri)
            TABLES_REPOSITORY[key] = adapter
        adapters.append(adapter)
    return adapters
|
||||
|
||||
|
||||
# Functions used to emulate SQLObject's logical operators.
|
||||
def AND(*params):
    """Emulate SQLObject's AND.

    All the given conditions are forwarded to and_().
    """
    return and_(*params)
|
||||
|
||||
def OR(*params):
    """Emulate SQLObject's OR.

    All the given conditions are forwarded to or_().
    """
    return or_(*params)
|
||||
|
||||
def IN(item, inList):
    """Emulate SQLObject's IN.

    A column is tested with its own in_() method; for anything else, an
    OR of 'x == item' over the elements of inList is built."""
    if isinstance(item, schema.Column):
        return item.in_(inList)
    comparisons = [candidate == item for candidate in inList]
    return OR(*comparisons)
|
||||
|
||||
def ISNULL(x):
    """Emulate SQLObject's ISNULL."""
    # XXX: Should we use null()?  Can null() be a global instance?
    # XXX: Is it safe to test None with the == operator, in this case?
    # NOTE(review): presumably x is a column expression whose == builds a
    # SQL condition; 'x is None' would evaluate to a plain bool instead,
    # so do not "clean up" this comparison.
    return x == None
|
||||
|
||||
def ISNOTNULL(x):
    """Emulate SQLObject's ISNOTNULL."""
    # NOTE(review): presumably x is a column expression whose != builds a
    # SQL condition; keep the operator form rather than 'is not None'.
    return x != None
|
||||
|
||||
def CONTAINSSTRING(expr, pattern):
    """Emulate SQLObject's CONTAINSSTRING.

    Calls expr.like() with pattern wrapped in '%' wildcards.  The pattern
    is inserted as given: wildcard characters it contains are not escaped.
    """
    return expr.like('%%%s%%' % pattern)
|
||||
|
||||
|
||||
def toUTF8(s):
    """For some strange reason, sometimes SQLObject wants utf8 strings
    instead of unicode; with SQLAlchemy we just return the unicode text.

    The argument is returned unchanged."""
    return s
|
||||
|
||||
|
||||
class _AlchemyConnection(object):
|
||||
"""A proxy for the connection object, required since _ConnectionFairy
|
||||
uses __slots__."""
|
||||
def __init__(self, conn):
|
||||
self.conn = conn
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self.conn, name)
|
||||
|
||||
|
||||
def setConnection(uri, tables, encoding='utf8', debug=False):
    """Create the engine for uri, bind the module metadata to it and
    return a connection proxy.

    The returned object forwards to the raw connection and also carries
    the .module, .paramstyle, .getConnection and .dbName attributes."""
    # FIXME: why on earth MySQL requires an additional parameter,
    #        is well beyond my understanding...
    if uri.startswith('mysql'):
        separator = '?'
        if '?' in uri:
            separator = '&'
        uri = '%s%scharset=%s' % (uri, separator, encoding)
    engineOptions = {'encoding': encoding}
    if debug:
        engineOptions['echo'] = True
    if uri.startswith('ibm_db'):
        # Try to work-around a possible bug of the ibm_db DB2 driver.
        engineOptions['convert_unicode'] = True
    # XXX: is this the best way to connect?
    engine = create_engine(uri, **engineOptions)
    metadata.bind = engine
    eng_conn = engine.connect()
    # Only on interpreters newer than Python 2.5.
    if uri.startswith('sqlite') and sys.version_info[:2] > (2, 5):
        eng_conn.connection.connection.text_factory = str
    # XXX: OH MY, THAT'S A MESS!
    #      We need to return a "connection" object, with the .dbName
    #      attribute set to the db engine name (e.g. "mysql"), .paramstyle
    #      set to the style of the parameters for query() calls, and the
    #      .module attribute set to a module (?) with .OperationalError and
    #      .IntegrityError attributes.
    #      Another attribute of "connection" is the getConnection() function,
    #      used to return an object with a .cursor() method.
    connection = _AlchemyConnection(eng_conn.connection)
    dialect = eng_conn.dialect
    paramstyle = dialect.paramstyle
    connection.module = dialect.dbapi
    connection.paramstyle = paramstyle
    connection.getConnection = lambda: connection.connection
    connection.dbName = engine.url.drivername
    return connection
|
||||
|
||||
|
||||
@@ -1,269 +0,0 @@
|
||||
/*
|
||||
* cutils.c module.
|
||||
*
|
||||
* Miscellaneous functions to speed up the IMDbPY package.
|
||||
*
|
||||
* Contents:
|
||||
* - pyratcliff():
|
||||
* Function that implements the Ratcliff-Obershelp comparison
|
||||
* amongst Python strings.
|
||||
*
|
||||
* - pysoundex():
|
||||
* Return a soundex code string, for the given string.
|
||||
*
|
||||
* Copyright 2004-2009 Davide Alberani <da@erlug.linux.it>
|
||||
* Released under the GPL license.
|
||||
*
|
||||
* NOTE: The Ratcliff-Obershelp part was heavily based on code from the
|
||||
* "simil" Python module.
|
||||
* The "simil" module is copyright of Luca Montecchiani <cbm64 _at_ inwind.it>
|
||||
* and can be found here: http://spazioinwind.libero.it/montecchiani/
|
||||
 * It was released under the GPL license; original comments are left
|
||||
* below.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/*========== Ratcliff-Obershelp ==========*/
|
||||
/*****************************************************************************
|
||||
*
|
||||
* Stolen code from :
|
||||
*
|
||||
* [Python-Dev] Why is soundex marked obsolete?
|
||||
* by Eric S. Raymond [4]esr@thyrsus.com
|
||||
* on Sun, 14 Jan 2001 14:09:01 -0500
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
/*****************************************************************************
|
||||
*
|
||||
* Ratcliff-Obershelp common-subpattern similarity.
|
||||
*
|
||||
* This code first appeared in a letter to the editor in Doctor
|
||||
* Dobbs's Journal, 11/1988. The original article on the algorithm,
|
||||
* "Pattern Matching by Gestalt" by John Ratcliff, had appeared in the
|
||||
* July 1988 issue (#181) but the algorithm was presented in assembly.
|
||||
* The main drawback of the Ratcliff-Obershelp algorithm is the cost
|
||||
* of the pairwise comparisons. It is significantly more expensive
|
||||
* than stemming, Hamming distance, soundex, and the like.
|
||||
*
|
||||
* Running time quadratic in the data size, memory usage constant.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <Python.h>
|
||||
|
||||
#define DONTCOMPARE_NULL 0.0
|
||||
#define DONTCOMPARE_SAME 1.0
|
||||
#define COMPARE 2.0
|
||||
#define STRING_MAXLENDIFFER 0.7
|
||||
|
||||
/* As of 05 Mar 2008, the longest title is ~600 chars. */
|
||||
#define MXLINELEN 1023
|
||||
|
||||
#define MAX(a,b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
|
||||
//*****************************************
|
||||
// preliminary check....
|
||||
//*****************************************
|
||||
static float
|
||||
strings_check(char const *s, char const *t)
|
||||
{
|
||||
float threshold; // lenght difference
|
||||
int s_len = strlen(s); // length of s
|
||||
int t_len = strlen(t); // length of t
|
||||
|
||||
// NULL strings ?
|
||||
if ((t_len * s_len) == 0)
|
||||
return (DONTCOMPARE_NULL);
|
||||
|
||||
// the same ?
|
||||
if (strcmp(s, t) == 0)
|
||||
return (DONTCOMPARE_SAME);
|
||||
|
||||
// string lenght difference threshold
|
||||
// we don't want to compare too different lenght strings ;)
|
||||
if (s_len < t_len)
|
||||
threshold = (float) s_len / (float) t_len;
|
||||
else
|
||||
threshold = (float) t_len / (float) s_len;
|
||||
if (threshold < STRING_MAXLENDIFFER)
|
||||
return (DONTCOMPARE_NULL);
|
||||
|
||||
// proceed
|
||||
return (COMPARE);
|
||||
}
|
||||
|
||||
|
||||
/*
 * Count the characters shared by the ranges [st1, end1) and [st2, end2)
 * according to the Ratcliff-Obershelp scheme: find the longest common
 * substring, then recurse on what lies to its right and to its left.
 *
 * The match scan is bounded by end1/end2.  Stopping only at the NUL
 * terminator would let a match found while processing a left-hand
 * sub-range run into the region already counted by the caller, giving
 * totals larger than the strings themselves.
 */
static int
RatcliffObershelp(char *st1, char *end1, char *st2, char *end2)
{
    char *a1, *a2;
    char *b1, *b2;
    char *s1 = st1, *s2 = st2;  /* initializations are just to pacify GCC */
    int max, i;

    /* An empty range has nothing in common with anything. */
    if (end1 <= st1 || end2 <= st2)
        return (0);
    /* Two single-character ranges are reported as sharing nothing. */
    if (end1 == st1 + 1 && end2 == st2 + 1)
        return (0);

    max = 0;
    b1 = end1;
    b2 = end2;

    for (a1 = st1; a1 < b1; a1++) {
        for (a2 = st2; a2 < b2; a2++) {
            if (*a1 == *a2) {
                /* Length of the common substring, kept inside both ranges. */
                for (i = 1;
                     a1 + i < end1 && a2 + i < end2 && a1[i] == a2[i];
                     i++)
                    continue;
                if (i > max) {
                    max = i;
                    s1 = a1;
                    s2 = a2;
                    /* A longer match cannot start in the last max chars. */
                    b1 = end1 - max;
                    b2 = end2 - max;
                }
            }
        }
    }
    if (!max)
        return (0);
    max += RatcliffObershelp(s1 + max, end1, s2 + max, end2);  /* rhs */
    max += RatcliffObershelp(st1, s1, st2, s2);                /* lhs */
    return max;
}
|
||||
|
||||
|
||||
static float
|
||||
ratcliff(char *s1, char *s2)
|
||||
/* compute Ratcliff-Obershelp similarity of two strings */
|
||||
{
|
||||
int l1, l2;
|
||||
float res;
|
||||
|
||||
// preliminary tests
|
||||
res = strings_check(s1, s2);
|
||||
if (res != COMPARE)
|
||||
return(res);
|
||||
|
||||
l1 = strlen(s1);
|
||||
l2 = strlen(s2);
|
||||
|
||||
return 2.0 * RatcliffObershelp(s1, s1 + l1, s2, s2 + l2) / (l1 + l2);
|
||||
}
|
||||
|
||||
|
||||
/* Change a string to lowercase. */
|
||||
/*
 * Lower-case a NUL-terminated string in place.
 *
 * The string is walked once (no strlen() call per iteration) and every
 * character is passed to tolower() as an unsigned char, as the C
 * library requires for values outside the 0..127 range.
 */
static void
strtolower(char *s1)
{
    for (; *s1 != '\0'; s1++)
        *s1 = (char) tolower((unsigned char) *s1);
}
|
||||
|
||||
|
||||
/* Ratcliff-Obershelp for two python strings; returns a python float. */
|
||||
/*
 * Python entry point: ratcliff(s1, s2[, obj]) -> float.
 *
 * The two strings are copied into local buffers (truncated to MXLINELEN
 * characters), lower-cased, and compared with ratcliff().
 */
static PyObject*
pyratcliff(PyObject *self, PyObject *pArgs)
{
    char *s1 = NULL;
    char *s2 = NULL;
    PyObject *discard = NULL;
    char s1copy[MXLINELEN+1];
    char s2copy[MXLINELEN+1];

    /* The optional PyObject parameter is here to be compatible
     * with the pure python implementation, which uses a
     * difflib.SequenceMatcher object. */
    if (!PyArg_ParseTuple(pArgs, "ss|O", &s1, &s2, &discard))
        return NULL;

    /* strncpy() does not terminate the destination when the source is
     * MXLINELEN bytes or longer, so terminate the copies explicitly. */
    strncpy(s1copy, s1, MXLINELEN);
    s1copy[MXLINELEN] = '\0';
    strncpy(s2copy, s2, MXLINELEN);
    s2copy[MXLINELEN] = '\0';

    /* Work on copies: the buffers handed over by Python are not ours. */
    strtolower(s1copy);
    strtolower(s2copy);

    return Py_BuildValue("f", ratcliff(s1copy, s2copy));
}
|
||||
|
||||
|
||||
/*========== soundex ==========*/
|
||||
/* Max length of the soundex code to output (an uppercase char and
|
||||
* _at most_ 4 digits). */
|
||||
#define SOUNDEX_LEN 5
|
||||
|
||||
/* Group Number Lookup Table */
|
||||
static char soundTable[26] =
|
||||
{ 0 /* A */, '1' /* B */, '2' /* C */, '3' /* D */, 0 /* E */, '1' /* F */,
|
||||
'2' /* G */, 0 /* H */, 0 /* I */, '2' /* J */, '2' /* K */, '4' /* L */,
|
||||
'5' /* M */, '5' /* N */, 0 /* O */, '1' /* P */, '2' /* Q */, '6' /* R */,
|
||||
'2' /* S */, '3' /* T */, 0 /* U */, '1' /* V */, 0 /* W */, '2' /* X */,
|
||||
0 /* Y */, '2' /* Z */};
|
||||
|
||||
/*
 * Python entry point: soundex(s) -> str or None.
 *
 * Only the ASCII letters of the argument are considered.  The result is
 * the first letter (upper-cased) followed by at most SOUNDEX_LEN - 1
 * digits taken from soundTable; None is returned when the argument
 * contains no ASCII letter.
 */
static PyObject*
pysoundex(PyObject *self, PyObject *pArgs)
{
    int i, j, n;
    int upper;
    char *s = NULL;
    char word[MXLINELEN+1];
    char soundCode[SOUNDEX_LEN+1];
    char c;

    if (!PyArg_ParseTuple(pArgs, "s", &s))
        return NULL;

    j = 0;
    n = strlen(s);

    /* Convert to uppercase and exclude non-ascii chars.  The j bound
     * keeps long inputs from writing past the end of word[]. */
    for (i = 0; i < n && j < MXLINELEN; i++) {
        /* toupper() must be given an unsigned char value. */
        upper = toupper((unsigned char) s[i]);
        if (upper < 91 && upper > 64) {
            word[j] = (char) upper;
            j++;
        }
    }
    word[j] = '\0';

    /* j is the number of letters kept, i.e. the length of word. */
    n = j;
    if (n == 0) {
        /* If the string is empty, returns None. */
        return Py_BuildValue("");
    }
    soundCode[0] = word[0];

    /* Build the soundCode string. */
    j = 1;
    for (i = 1; j < SOUNDEX_LEN && i < n; i++) {
        c = soundTable[(word[i]-65)];
        /* Compact zeroes and equal consecutive digits ("12234112"->"123412") */
        if (c != 0 && c != soundCode[j-1]) {
            soundCode[j++] = c;
        }
    }
    soundCode[j] = '\0';

    return Py_BuildValue("s", soundCode);
}
|
||||
|
||||
|
||||
static PyMethodDef cutils_methods[] = {
|
||||
{"ratcliff", pyratcliff,
|
||||
METH_VARARGS, "Ratcliff-Obershelp similarity."},
|
||||
{"soundex", pysoundex,
|
||||
METH_VARARGS, "Soundex code for strings."},
|
||||
{NULL}
|
||||
};
|
||||
|
||||
|
||||
void
|
||||
initcutils(void)
|
||||
{
|
||||
Py_InitModule("cutils", cutils_methods);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,461 +0,0 @@
|
||||
#-*- encoding: utf-8 -*-
|
||||
"""
|
||||
parser.sql.dbschema module (imdb.parser.sql package).
|
||||
|
||||
This module provides the schema used to describe the layout of the
|
||||
database used by the imdb.parser.sql package; functions to create/drop
|
||||
tables and indexes are also provided.
|
||||
|
||||
Copyright 2005-2010 Davide Alberani <da@erlug.linux.it>
|
||||
2006 Giuseppe "Cowo" Corbelli <cowo --> lugbs.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
_dbschema_logger = logging.getLogger('imdbpy.parser.sql.dbschema')
|
||||
|
||||
|
||||
# Placeholders for column types.
|
||||
INTCOL = 1
|
||||
UNICODECOL = 2
|
||||
STRINGCOL = 3
|
||||
_strMap = {1: 'INTCOL', 2: 'UNICODECOL', 3: 'STRINGCOL'}
|
||||
|
||||
class DBCol(object):
    """Description of a single column of a table.

    Besides the name and the kind (one of the INTCOL, UNICODECOL and
    STRINGCOL placeholders), three keyword arguments are handled
    specially and stored as attributes: 'index', 'indexLen' and
    'foreignKey'.  Every other keyword argument is kept, untouched,
    in the 'params' dictionary.
    """
    def __init__(self, name, kind, **params):
        self.name = name
        self.kind = kind
        # The special keywords are removed from params; each one
        # defaults to None when not given.
        self.index = params.pop('index', None)
        self.indexLen = params.pop('indexLen', None)
        # If not None, two notations are accepted: 'TableName'
        # and 'TableName.ColName'; in the first case, 'id' is assumed
        # as the name of the pointed column.
        self.foreignKey = params.pop('foreignKey', None)
        self.params = params

    def __str__(self):
        """Return a compact, human-readable description of the column."""
        pieces = ['<DBCol %s %s' % (self.name, _strMap[self.kind])]
        if self.index:
            pieces.append(' INDEX')
            if self.indexLen:
                pieces.append('[:%d]' % self.indexLen)
        if self.foreignKey:
            pieces.append(' FOREIGN')
        if 'default' in self.params:
            default = self.params['default']
            # A None default is shown bare, anything else is quoted.
            if default is not None:
                default = '"%s"' % default
            pieces.append(' DEFAULT=%s' % default)
        # The remaining parameters are listed by (upper-cased) name only.
        for param in self.params:
            if param != 'default':
                pieces.append(' %s' % param.upper())
        pieces.append('>')
        return ''.join(pieces)

    def __repr__(self):
        """Return a constructor-like description of the column."""
        text = '<DBCol(name="%s", %s' % (self.name, _strMap[self.kind])
        if self.index:
            text += ', index="%s"' % self.index
        if self.indexLen:
            text += ', indexLen=%d' % self.indexLen
        if self.foreignKey:
            text += ', foreignKey="%s"' % self.foreignKey
        for key, value in self.params.items():
            # String values are shown between double quotes.
            if isinstance(value, (unicode, str)):
                value = u'"%s"' % value
            text += ', %s=%s' % (key, value)
        return text + ')>'
|
||||
|
||||
|
||||
class DBTable(object):
    """Description of a table: a name, its columns and, optionally,
    the default values to insert (the 'values' keyword argument, a
    {'column': (list, of, values, ...)} dictionary)."""
    def __init__(self, name, *cols, **kwds):
        self.name = name
        self.cols = cols
        # Default values; an empty dictionary when none are given.
        self.values = kwds.get('values', {})

    def __str__(self):
        """Return a short summary: name, number of columns and values."""
        nrValues = sum([len(vals) for vals in self.values.values()])
        return '<DBTable %s (%d cols, %d values)>' % (self.name,
                                                      len(self.cols),
                                                      nrValues)

    def __repr__(self):
        """Return a constructor-like description of the table."""
        chunks = ['<DBTable(name="%s"' % self.name]
        # The representation of each column, without the enclosing
        # angle brackets.
        colsRepr = ', '.join([repr(col).rstrip('>').lstrip('<')
                              for col in self.cols])
        if colsRepr:
            chunks.append(', %s' % colsRepr)
        if self.values:
            chunks.append(', values=%s' % self.values)
        chunks.append(')>')
        return ''.join(chunks)
|
||||
|
||||
|
||||
# Default values to insert in some tables: {'column': (list, of, values, ...)}
|
||||
kindTypeDefs = {'kind': ('movie', 'tv series', 'tv movie', 'video movie',
|
||||
'tv mini series', 'video game', 'episode')}
|
||||
companyTypeDefs = {'kind': ('distributors', 'production companies',
|
||||
'special effects companies', 'miscellaneous companies')}
|
||||
infoTypeDefs = {'info': ('runtimes', 'color info', 'genres', 'languages',
|
||||
'certificates', 'sound mix', 'tech info', 'countries', 'taglines',
|
||||
'keywords', 'alternate versions', 'crazy credits', 'goofs',
|
||||
'soundtrack', 'quotes', 'release dates', 'trivia', 'locations',
|
||||
'mini biography', 'birth notes', 'birth date', 'height',
|
||||
'death date', 'spouse', 'other works', 'birth name',
|
||||
'salary history', 'nick names', 'books', 'agent address',
|
||||
'biographical movies', 'portrayed in', 'where now', 'trade mark',
|
||||
'interviews', 'article', 'magazine cover photo', 'pictorial',
|
||||
'death notes', 'LD disc format', 'LD year', 'LD digital sound',
|
||||
'LD official retail price', 'LD frequency response', 'LD pressing plant',
|
||||
'LD length', 'LD language', 'LD review', 'LD spaciality', 'LD release date',
|
||||
'LD production country', 'LD contrast', 'LD color rendition',
|
||||
'LD picture format', 'LD video noise', 'LD video artifacts',
|
||||
'LD release country', 'LD sharpness', 'LD dynamic range',
|
||||
'LD audio noise', 'LD color information', 'LD group genre',
|
||||
'LD quality program', 'LD close captions-teletext-ld-g',
|
||||
'LD category', 'LD analog left', 'LD certification',
|
||||
'LD audio quality', 'LD video quality', 'LD aspect ratio',
|
||||
'LD analog right', 'LD additional information',
|
||||
'LD number of chapter stops', 'LD dialogue intellegibility',
|
||||
'LD disc size', 'LD master format', 'LD subtitles',
|
||||
'LD status of availablility', 'LD quality of source',
|
||||
'LD number of sides', 'LD video standard', 'LD supplement',
|
||||
'LD original title', 'LD sound encoding', 'LD number', 'LD label',
|
||||
'LD catalog number', 'LD laserdisc title', 'screenplay-teleplay',
|
||||
'novel', 'adaption', 'book', 'production process protocol',
|
||||
'printed media reviews', 'essays', 'other literature', 'mpaa',
|
||||
'plot', 'votes distribution', 'votes', 'rating',
|
||||
'production dates', 'copyright holder', 'filming dates', 'budget',
|
||||
'weekend gross', 'gross', 'opening weekend', 'rentals',
|
||||
'admissions', 'studios', 'top 250 rank', 'bottom 10 rank')}
|
||||
compCastTypeDefs = {'kind': ('cast', 'crew', 'complete', 'complete+verified')}
|
||||
linkTypeDefs = {'link': ('follows', 'followed by', 'remake of', 'remade as',
|
||||
'references', 'referenced in', 'spoofs', 'spoofed in',
|
||||
'features', 'featured in', 'spin off from', 'spin off',
|
||||
'version of', 'similar to', 'edited into',
|
||||
'edited from', 'alternate language version of',
|
||||
'unknown link')}
|
||||
roleTypeDefs = {'role': ('actor', 'actress', 'producer', 'writer',
|
||||
'cinematographer', 'composer', 'costume designer',
|
||||
'director', 'editor', 'miscellaneous crew',
|
||||
'production designer', 'guest')}
|
||||
|
||||
# Schema of tables in our database.
|
||||
# XXX: Foreign keys can be used to create constraints between tables,
|
||||
# but they create indexes in the database, and this
|
||||
# means poor performances at insert-time.
|
||||
DB_SCHEMA = [
|
||||
DBTable('Name',
|
||||
# namePcodeCf is the soundex of the name in the canonical format.
|
||||
# namePcodeNf is the soundex of the name in the normal format, if
|
||||
# different from namePcodeCf.
|
||||
# surnamePcode is the soundex of the surname, if different from the
|
||||
# other two values.
|
||||
|
||||
# The 'id' column is simply skipped by SQLObject (it's a default);
|
||||
# the alternateID attribute here will be ignored by SQLAlchemy.
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
|
||||
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
|
||||
DBCol('imdbID', INTCOL, default=None),
|
||||
DBCol('namePcodeCf', STRINGCOL, length=5, default=None,
|
||||
index='idx_pcodecf'),
|
||||
DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
|
||||
index='idx_pcodenf'),
|
||||
DBCol('surnamePcode', STRINGCOL, length=5, default=None,
|
||||
index='idx_pcode'),
|
||||
DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5')
|
||||
),
|
||||
|
||||
DBTable('CharName',
|
||||
# namePcodeNf is the soundex of the name in the normal format.
|
||||
# surnamePcode is the soundex of the surname, if different
|
||||
# from namePcodeNf.
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
|
||||
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
|
||||
DBCol('imdbID', INTCOL, default=None),
|
||||
DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
|
||||
index='idx_pcodenf'),
|
||||
DBCol('surnamePcode', STRINGCOL, length=5, default=None,
|
||||
index='idx_pcode'),
|
||||
DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5')
|
||||
),
|
||||
|
||||
DBTable('CompanyName',
|
||||
# namePcodeNf is the soundex of the name in the normal format.
|
||||
# namePcodeSf is the soundex of the name plus the country code.
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
|
||||
DBCol('countryCode', UNICODECOL, length=255, default=None),
|
||||
DBCol('imdbID', INTCOL, default=None),
|
||||
DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
|
||||
index='idx_pcodenf'),
|
||||
DBCol('namePcodeSf', STRINGCOL, length=5, default=None,
|
||||
index='idx_pcodesf'),
|
||||
DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5')
|
||||
),
|
||||
|
||||
DBTable('KindType',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('kind', STRINGCOL, length=15, default=None, alternateID=True),
|
||||
values=kindTypeDefs
|
||||
),
|
||||
|
||||
DBTable('Title',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('title', UNICODECOL, notNone=True,
|
||||
index='idx_title', indexLen=10),
|
||||
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
|
||||
DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'),
|
||||
DBCol('productionYear', INTCOL, default=None),
|
||||
DBCol('imdbID', INTCOL, default=None),
|
||||
DBCol('phoneticCode', STRINGCOL, length=5, default=None,
|
||||
index='idx_pcode'),
|
||||
DBCol('episodeOfID', INTCOL, default=None, index='idx_epof',
|
||||
foreignKey='Title'),
|
||||
DBCol('seasonNr', INTCOL, default=None),
|
||||
DBCol('episodeNr', INTCOL, default=None),
|
||||
# Maximum observed length is 44; 49 can store 5 comma-separated
|
||||
# year-year pairs.
|
||||
DBCol('seriesYears', STRINGCOL, length=49, default=None),
|
||||
DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5')
|
||||
),
|
||||
|
||||
DBTable('CompanyType',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('kind', STRINGCOL, length=32, default=None, alternateID=True),
|
||||
values=companyTypeDefs
|
||||
),
|
||||
|
||||
DBTable('AkaName',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('personID', INTCOL, notNone=True, index='idx_person',
|
||||
foreignKey='Name'),
|
||||
DBCol('name', UNICODECOL, notNone=True),
|
||||
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
|
||||
DBCol('namePcodeCf', STRINGCOL, length=5, default=None,
|
||||
index='idx_pcodecf'),
|
||||
DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
|
||||
index='idx_pcodenf'),
|
||||
DBCol('surnamePcode', STRINGCOL, length=5, default=None,
|
||||
index='idx_pcode'),
|
||||
DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5')
|
||||
),
|
||||
|
||||
DBTable('AkaTitle',
|
||||
# XXX: It's safer to set notNone to False, here.
|
||||
# alias for akas are stored completely in the AkaTitle table;
|
||||
# this means that episodes will set also a "tv series" alias name.
|
||||
# Reading the aka-title.list file it looks like there are
|
||||
# episode titles with aliases to different titles for both
|
||||
# the episode and the series title, while for just the series
|
||||
# there are no aliases.
|
||||
# E.g.:
|
||||
# aka title original title
|
||||
# "Series, The" (2005) {The Episode} "Other Title" (2005) {Other Title}
|
||||
# But there is no:
|
||||
# "Series, The" (2005) "Other Title" (2005)
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('movieID', INTCOL, notNone=True, index='idx_movieid',
|
||||
foreignKey='Title'),
|
||||
DBCol('title', UNICODECOL, notNone=True),
|
||||
DBCol('imdbIndex', UNICODECOL, length=12, default=None),
|
||||
DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'),
|
||||
DBCol('productionYear', INTCOL, default=None),
|
||||
DBCol('phoneticCode', STRINGCOL, length=5, default=None,
|
||||
index='idx_pcode'),
|
||||
DBCol('episodeOfID', INTCOL, default=None, index='idx_epof',
|
||||
foreignKey='AkaTitle'),
|
||||
DBCol('seasonNr', INTCOL, default=None),
|
||||
DBCol('episodeNr', INTCOL, default=None),
|
||||
DBCol('note', UNICODECOL, default=None),
|
||||
DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5')
|
||||
),
|
||||
|
||||
DBTable('RoleType',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('role', STRINGCOL, length=32, notNone=True, alternateID=True),
|
||||
values=roleTypeDefs
|
||||
),
|
||||
|
||||
DBTable('CastInfo',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('personID', INTCOL, notNone=True, index='idx_pid',
|
||||
foreignKey='Name'),
|
||||
DBCol('movieID', INTCOL, notNone=True, index='idx_mid',
|
||||
foreignKey='Title'),
|
||||
DBCol('personRoleID', INTCOL, default=None, index='idx_cid',
|
||||
foreignKey='CharName'),
|
||||
DBCol('note', UNICODECOL, default=None),
|
||||
DBCol('nrOrder', INTCOL, default=None),
|
||||
DBCol('roleID', INTCOL, notNone=True, foreignKey='RoleType')
|
||||
),
|
||||
|
||||
DBTable('CompCastType',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('kind', STRINGCOL, length=32, notNone=True, alternateID=True),
|
||||
values=compCastTypeDefs
|
||||
),
|
||||
|
||||
DBTable('CompleteCast',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('movieID', INTCOL, index='idx_mid', foreignKey='Title'),
|
||||
DBCol('subjectID', INTCOL, notNone=True, foreignKey='CompCastType'),
|
||||
DBCol('statusID', INTCOL, notNone=True, foreignKey='CompCastType')
|
||||
),
|
||||
|
||||
DBTable('InfoType',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('info', STRINGCOL, length=32, notNone=True, alternateID=True),
|
||||
values=infoTypeDefs
|
||||
),
|
||||
|
||||
DBTable('LinkType',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('link', STRINGCOL, length=32, notNone=True, alternateID=True),
|
||||
values=linkTypeDefs
|
||||
),
|
||||
|
||||
DBTable('Keyword',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
# XXX: can't use alternateID=True, because it would create
|
||||
# a UNIQUE index; unfortunately (at least with a common
|
||||
# collation like utf8_unicode_ci) MySQL will consider
|
||||
# some different keywords identical - like
|
||||
# "fiancée" and "fiancee".
|
||||
DBCol('keyword', UNICODECOL, length=255, notNone=True,
|
||||
index='idx_keyword', indexLen=5),
|
||||
DBCol('phoneticCode', STRINGCOL, length=5, default=None,
|
||||
index='idx_pcode')
|
||||
),
|
||||
|
||||
DBTable('MovieKeyword',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('movieID', INTCOL, notNone=True, index='idx_mid',
|
||||
foreignKey='Title'),
|
||||
DBCol('keywordID', INTCOL, notNone=True, index='idx_keywordid',
|
||||
foreignKey='Keyword')
|
||||
),
|
||||
|
||||
DBTable('MovieLink',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('movieID', INTCOL, notNone=True, index='idx_mid',
|
||||
foreignKey='Title'),
|
||||
DBCol('linkedMovieID', INTCOL, notNone=True, foreignKey='Title'),
|
||||
DBCol('linkTypeID', INTCOL, notNone=True, foreignKey='LinkType')
|
||||
),
|
||||
|
||||
DBTable('MovieInfo',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('movieID', INTCOL, notNone=True, index='idx_mid',
|
||||
foreignKey='Title'),
|
||||
DBCol('infoTypeID', INTCOL, notNone=True, foreignKey='InfoType'),
|
||||
DBCol('info', UNICODECOL, notNone=True),
|
||||
DBCol('note', UNICODECOL, default=None)
|
||||
),
|
||||
|
||||
# This table is identical to MovieInfo, except that both 'infoTypeID'
|
||||
# and 'info' are indexed.
|
||||
DBTable('MovieInfoIdx',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('movieID', INTCOL, notNone=True, index='idx_mid',
|
||||
foreignKey='Title'),
|
||||
DBCol('infoTypeID', INTCOL, notNone=True, index='idx_infotypeid',
|
||||
foreignKey='InfoType'),
|
||||
DBCol('info', UNICODECOL, notNone=True, index='idx_info', indexLen=10),
|
||||
DBCol('note', UNICODECOL, default=None)
|
||||
),
|
||||
|
||||
DBTable('MovieCompanies',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('movieID', INTCOL, notNone=True, index='idx_mid',
|
||||
foreignKey='Title'),
|
||||
DBCol('companyID', INTCOL, notNone=True, index='idx_cid',
|
||||
foreignKey='CompanyName'),
|
||||
DBCol('companyTypeID', INTCOL, notNone=True, foreignKey='CompanyType'),
|
||||
DBCol('note', UNICODECOL, default=None)
|
||||
),
|
||||
|
||||
DBTable('PersonInfo',
|
||||
DBCol('id', INTCOL, notNone=True, alternateID=True),
|
||||
DBCol('personID', INTCOL, notNone=True, index='idx_pid',
|
||||
foreignKey='Name'),
|
||||
DBCol('infoTypeID', INTCOL, notNone=True, foreignKey='InfoType'),
|
||||
DBCol('info', UNICODECOL, notNone=True),
|
||||
DBCol('note', UNICODECOL, default=None)
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
# Functions to manage tables.
|
||||
def dropTables(tables, ifExists=True):
    """Drop the given tables, starting from the last one.

    ifExists is passed on to the dropTable method of each table."""
    # Going backwards is useful to avoid errors about foreign keys.
    for table in reversed(list(tables)):
        _dbschema_logger.info('dropping table %s', table._imdbpyName)
        table.dropTable(ifExists)
|
||||
|
||||
def createTables(tables, ifNotExists=True):
    """Create the given tables and fill them with the default values
    listed in their schema, if any.

    ifNotExists is passed on to the createTable method of each table."""
    for table in tables:
        _dbschema_logger.info('creating table %s', table._imdbpyName)
        table.createTable(ifNotExists)
        defaults = table._imdbpySchema.values
        if not defaults:
            # Nothing to insert in this table.
            continue
        _dbschema_logger.info('inserting values into table %s',
                              table._imdbpyName)
        # One row is created for every value of every listed column.
        for colName in defaults:
            for value in defaults[colName]:
                table(**{colName: unicode(value)})
|
||||
|
||||
def createIndexes(tables, ifNotExists=True):
    """Ask every given table to create its indexes in the database.

    ifNotExists is passed on to the addIndexes method of each table."""
    for tbl in tables:
        tableName = tbl._imdbpyName
        _dbschema_logger.info('creating indexes for table %s', tableName)
        tbl.addIndexes(ifNotExists)
|
||||
|
||||
def createForeignKeys(tables, ifNotExists=True):
    """Ask every given table to create its foreign keys.

    Each table receives a {table name: table} map of all the tables,
    along with the ifNotExists flag."""
    mapTables = dict([(tbl._imdbpyName, tbl) for tbl in tables])
    for tbl in tables:
        _dbschema_logger.info('creating foreign keys for table %s',
                              tbl._imdbpyName)
        tbl.addForeignKeys(mapTables, ifNotExists)
|
||||
|
||||
@@ -1,203 +0,0 @@
|
||||
"""
|
||||
parser.sql.objectadapter module (imdb.parser.sql package).
|
||||
|
||||
This module adapts the SQLObject ORM to the internal mechanism.
|
||||
|
||||
Copyright 2008-2010 Davide Alberani <da@erlug.linux.it>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
"""
|
||||
|
||||
import sys
|
||||
import logging
|
||||
|
||||
from sqlobject import *
|
||||
from sqlobject.sqlbuilder import ISNULL, ISNOTNULL, AND, OR, IN, CONTAINSSTRING
|
||||
|
||||
from dbschema import *
|
||||
|
||||
_object_logger = logging.getLogger('imdbpy.parser.sql.object')
|
||||
|
||||
|
||||
# Maps our placeholders to SQLObject's column types.
|
||||
MAP_COLS = {
|
||||
INTCOL: IntCol,
|
||||
UNICODECOL: UnicodeCol,
|
||||
STRINGCOL: StringCol
|
||||
}
|
||||
|
||||
|
||||
# Exception raised when Table.get(id) returns no value.
|
||||
NotFoundError = SQLObjectNotFound
|
||||
|
||||
|
||||
# class method to be added to the SQLObject class.
|
||||
def addIndexes(cls, ifNotExists=True):
    """Register the indexes described in the schema of this table,
    then create them in the database.

    An OperationalError raised while creating the indexes is logged
    as a warning, not propagated."""
    for col in cls._imdbpySchema.cols:
        if not col.index:
            continue
        # Skip the indexes that are already present.
        if col.index in [i.name for i in cls.sqlmeta.indexes]:
            continue
        if col.indexLen:
            # Index only the first indexLen characters of the column.
            target = {'column': col.name, 'length': col.indexLen}
        else:
            target = col.name
        cls.sqlmeta.addIndex(DatabaseIndex(target, name=col.index))
    try:
        cls.createIndexes(ifNotExists)
    except dberrors.OperationalError:
        e = sys.exc_info()[1]
        # NOTE(review): col is whatever column the loop above ended on,
        # not necessarily the one whose index failed - confirm.
        _object_logger.warn('Skipping creation of the %s.%s index: %s' %
                            (cls.sqlmeta.table, col.name, e))
addIndexes = classmethod(addIndexes)
|
||||
|
||||
|
||||
# Global repository for "fake" tables with Foreign Keys - need to
|
||||
# prevent troubles if addForeignKeys is called more than one time.
|
||||
FAKE_TABLES_REPOSITORY = {}
|
||||
|
||||
def _buildFakeFKTable(cls, fakeTableName):
    """Return a "fake" table, with foreign keys where needed.

    The returned object is a new SQLObject subclass, named
    fakeTableName, with the columns listed in the schema of cls
    ('id' excluded); the columns flagged as foreign keys become
    ForeignKey columns."""
    attrs = {}
    for col in cls._imdbpySchema.cols:
        if col.name == 'id':
            continue
        if not col.foreignKey:
            # A non-foreign key column - add it as usual.
            attrs[col.name] = MAP_COLS[col.kind](**col.params)
            continue
        # The ForeignKey column is named without the trailing 'ID'.
        thisColName = col.name
        if thisColName.endswith('ID'):
            thisColName = thisColName[:-2]
        # XXX: Foreign Keys pointing to TableName.ColName not yet supported:
        #      only the table name is used.
        foreignTableName = col.foreignKey.split('.', 1)[0]
        # Create a Foreign Key column, with the correct references.
        attrs[thisColName] = ForeignKey(foreignTableName, name=thisColName,
                                        default=None)
    # Build a _NEW_ SQLObject subclass, with foreign keys, if needed.
    return type(fakeTableName, (SQLObject,), attrs)
|
||||
|
||||
def addForeignKeys(cls, mapTables, ifNotExists=True):
    """Create the foreign keys described in the schema of this table.

    The SQL statements are obtained from a "fake" copy of the table
    (built once and then kept in FAKE_TABLES_REPOSITORY) and run after
    the 'myfaketable' prefix is stripped from them."""
    fkCols = [col for col in cls._imdbpySchema.cols if col.foreignKey]
    # Do not even try, if there are no FK, in this table.
    if not fkCols:
        return
    fakeTableName = 'myfaketable%s' % cls.sqlmeta.table
    if fakeTableName not in FAKE_TABLES_REPOSITORY:
        FAKE_TABLES_REPOSITORY[fakeTableName] = _buildFakeFKTable(
            cls, fakeTableName)
    newcls = FAKE_TABLES_REPOSITORY[fakeTableName]
    # Connect the class with foreign keys.
    newcls.setConnection(cls._connection)
    conn = newcls._connection
    for col in fkCols:
        if col.name == 'id':
            continue
        # Get the SQL that _WOULD BE_ run, if we had to create
        # this "fake" table.
        fkQuery = conn.createReferenceConstraint(
            newcls, newcls.sqlmeta.columns[col.name])
        if not fkQuery:
            # Probably the db doesn't support foreign keys (SQLite).
            continue
        # Remove "myfaketable" to get references to _real_ tables,
        # then execute the query.
        conn.query(fkQuery.replace('myfaketable', ''))
    # Disconnect it.
    conn.close()
addForeignKeys = classmethod(addForeignKeys)
|
||||
|
||||
|
||||
# Module-level "cache" for SQLObject classes, to prevent
|
||||
# "class TheClass is already in the registry" errors, when
|
||||
# two or more connections to the database are made.
|
||||
# XXX: is this the best way to act?
|
||||
TABLES_REPOSITORY = {}
|
||||
|
||||
def getDBTables(uri=None):
    """Return the list of SQLObject classes used to access the database,
    one for every table in DB_SCHEMA.

    Classes are built only once and then taken from TABLES_REPOSITORY.
    The uri argument is accepted but not used here."""
    DB_TABLES = []
    for table in DB_SCHEMA:
        if table.name not in TABLES_REPOSITORY:
            attrs = dict([(col.name, MAP_COLS[col.kind](**col.params))
                          for col in table.cols if col.name != 'id'])
            attrs.update({'_imdbpyName': table.name,
                          '_imdbpySchema': table,
                          'addIndexes': addIndexes,
                          'addForeignKeys': addForeignKeys})
            # Create a subclass of SQLObject and remember it.
            TABLES_REPOSITORY[table.name] = type(table.name, (SQLObject,),
                                                 attrs)
        DB_TABLES.append(TABLES_REPOSITORY[table.name])
    return DB_TABLES
|
||||
|
||||
|
||||
def toUTF8(s):
    """Return s encoded in UTF-8 (sometimes SQLObject wants utf8
    strings instead of unicode)."""
    encoded = s.encode('utf_8')
    return encoded
|
||||
|
||||
|
||||
def setConnection(uri, tables, encoding='utf8', debug=False):
    """Open a connection to the database at uri and set it on every
    table; the connection object is returned.

    encoding is used as the charset of MySQL connections; debug is
    stored in the 'debug' attribute of the connection."""
    kw = {}
    # FIXME: it's absolutely unclear what we should do to correctly
    #        support unicode in MySQL; with some versions of SQLObject,
    #        it seems that setting use_unicode=1 is the _wrong_ thing to do.
    _uriLower = uri.lower()
    if _uriLower.startswith('mysql'):
        kw['use_unicode'] = 1
        kw['charset'] = encoding
    conn = connectionForURI(uri, **kw)
    conn.debug = debug
    # The scheme is checked on the lower-cased uri, as done above
    # for MySQL.
    if _uriLower.startswith('sqlite'):
        # Only with Python 2.6 or later.
        if sys.version_info[:2] > (2, 5):
            conn.connection.connection.text_factory = str
    for table in tables:
        table.setConnection(conn)
        # FIXME: is it safe to set table._cacheValue to False?  Looks like
        #        we can't retrieve correct values after an update (I think
        #        it's never needed, but...)  Anyway, these are set to False
        #        for performance reason at insert time (see imdbpy2sql.py).
        table._cacheValue = False
    # Required by imdbpy2sql.py.
    conn.paramstyle = conn.module.paramstyle
    return conn
|
||||
|
||||
-1545
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user