imdb get info

This commit is contained in:
Ruud
2012-02-05 15:01:05 +01:00
parent 8842bf2bbb
commit dfd4c2eacf
44 changed files with 33 additions and 21399 deletions
+33 -57
View File
@@ -1,81 +1,57 @@
from couchpotato.core.event import addEvent
from couchpotato.core.logger import CPLog
from couchpotato.core.providers.movie.base import MovieProvider
from imdb import IMDb
from imdb import IMDb, helpers
from imdb._logging import setLevel
import time
log = CPLog(__name__)
# NOTE(review): this span looks like a flattened diff view. Lines from the old
# and the new version of the class are interleaved without +/- markers and all
# indentation is lost, so it is not runnable as shown. The comments below only
# describe what the individual lines show; confirm against the real diff.
class IMDB(MovieProvider):
# Info sets handed to get_movie() through `info=` in the getInfo body below.
info_list = ('main', 'plot', 'release dates', 'taglines', 'synopsis')
def __init__(self):
#addEvent('movie.search', self.search)
#addEvent('movie.info', self.getInfo)
# IMDb access object created with the 'http' argument; search_movie(),
# get_movie() and update() below are all called on it.
self.p = IMDb('http')
setLevel('warn')
# NOTE(review): `search` is defined twice in this span (here and further down
# with a `q` parameter) -- presumably one is the new stub and the other the
# removed implementation.
def search(self):
print 'search'
def conf(self, option):
return self.config.get('IMDB', option)
def find(self, q, limit = 8, alternative = True):
''' Find movie by name '''
log.info('IMDB - Searching for movie: %s' % q)
def search(self, q, limit = 12):
r = self.p.search_movie(q)
return self.toResults(r, limit)
# Converts either one result (one=True) or an iterable of results into feed
# items; the list form recurses with one=True and stops after `limit` items.
def toResults(self, r, limit = 8, one = False):
results = []
if one:
new = self.feedItem()
new.imdb = 'tt' + r.movieID
new.name = self.toSaveString(r['title'])
try:
new.year = r['year']
except:
# Any failure while reading the year falls back to an empty string.
new.year = ''
return new
else :
nr = 0
for movie in r:
results.append(self.toResults(movie, one = True))
nr += 1
if nr == limit:
break
return results
def findById(self, id):
''' Find movie by TheMovieDB ID '''
print '==' * 80
return []
# NOTE(review): two consecutive `def` lines -- presumably getInfo (added)
# replaces findByImdbId (removed); the statements that follow mix both bodies
# (`m`/`identifier` belong to getInfo, `r`/`id`/`details` to findByImdbId).
def getInfo(self, identifier = None):
def findByImdbId(self, id, details = False):
''' Find movie by IMDB ID '''
# The 'tt' prefix is stripped before the identifier is passed to get_movie().
m = self.p.get_movie(identifier.replace('tt', ''), info = self.info_list)
log.info('IMDB - Searching for movie: %s' % str(id))
poster = m['cover url']
poster_original = helpers.fullSizeCoverURL(m)
r = self.p.get_movie(id.replace('tt', ''))
movie_data = {
'id': identifier,
'titles': [m['title']],
'original_title': m['title'],
'rating': {
'imdb': (m.get('rating'), m.get('votes')),
},
'images': {
'poster': [poster] if poster else [],
'poster_original': [poster_original] if poster_original else [],
},
'imdb': identifier,
# NOTE(review): the next three entries index the result of m.get() directly.
# If m.get() behaves like dict.get(), a missing 'runtime' or 'release dates'
# yields None (TypeError on [0]) and the '' default for 'taglines' raises
# IndexError on [0] -- TODO confirm whether these keys can be absent.
'runtime': m.get('runtime')[0].split(':')[1],
'released': m.get('release dates')[0].split('::')[1],
'year': m['year'],
'plot': m.get('synopsis', ''),
'tagline': m.get('taglines', '')[0],
'genres': m.get('genres', []),
}
if not details:
return self.toResults(r, one = True)
else:
self.p.update(r)
self.p.update(r, info = 'release dates')
self.p.update(r, info = 'taglines')
return r
def get_IMDb_instance(self):
return IMDb('http')
def findReleaseDate(self, movie):
pass
# NOTE(review): presumably the final statement of getInfo in the new version.
return movie_data
-197
View File
@@ -1,197 +0,0 @@
"""
Character module (imdb package).
This module provides the Character class, used to store information about
a given character.
Copyright 2007-2010 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
from copy import deepcopy
from imdb.utils import analyze_name, build_name, flatten, _Container, cmpPeople
class Character(_Container):
    """A Character.

    Every piece of information about a character can be accessed as:
        characterObject['information']
    To get a list of the kinds of information stored in a
    Character object, use the keys() method; some useful aliases
    are defined (such as "also known as" for the "akas" key);
    see the keys_alias dictionary.
    """
    # The default sets of information retrieved.
    default_info = ('main', 'filmography', 'biography')

    # Aliases for some not-so-intuitive keys.
    keys_alias = {'mini biography': 'biography',
                  'bio': 'biography',
                  'character biography': 'biography',
                  'character biographies': 'biography',
                  'biographies': 'biography',
                  'character bio': 'biography',
                  'aka': 'akas',
                  'also known as': 'akas',
                  'alternate names': 'akas',
                  'personal quotes': 'quotes',
                  'keys': 'keywords',
                  'keyword': 'keywords'}

    keys_tomodify_list = ('biography', 'quotes')

    cmpFunct = cmpPeople

    def _init(self, **kwds):
        """Initialize a Character object.

        *characterID* -- the unique identifier for the character.
        *name* -- the name of the Character, if not in the data dictionary.
        *myName* -- the nickname you use for this character.
        *myID* -- your personal id for this character.
        *data* -- a dictionary used to initialize the object.
        *notes* -- notes about the given character.
        *accessSystem* -- a string representing the data access system used.
        *titlesRefs* -- a dictionary with references to movies.
        *namesRefs* -- a dictionary with references to persons.
        *charactersRefs* -- a dictionary with references to characters.
        *modFunct* -- function called returning text fields.
        """
        name = kwds.get('name')
        # A name already present in the data dictionary wins over the keyword.
        if name and 'name' not in self.data:
            self.set_name(name)
        self.characterID = kwds.get('characterID', None)
        self.myName = kwds.get('myName', u'')

    def _reset(self):
        """Reset the Character object."""
        self.characterID = None
        self.myName = u''

    def set_name(self, name):
        """Set the name of the character."""
        # XXX: convert name to unicode, if it's a plain string?
        d = analyze_name(name, canonical=0)
        self.data.update(d)

    def _additional_keys(self):
        """Valid keys to append to the data.keys() list."""
        addkeys = []
        if 'name' in self.data:
            addkeys += ['long imdb name']
        if 'headshot' in self.data:
            addkeys += ['full-size headshot']
        return addkeys

    def _getitem(self, key):
        """Handle special keys; return None for any other key."""
        ## XXX: can a character have an imdbIndex?
        if 'name' in self.data:
            if key == 'long imdb name':
                return build_name(self.data)
        if key == 'full-size headshot' and 'headshot' in self.data:
            # _re_fullsizeURL is not defined in this class; presumably it is
            # inherited from _Container.
            return self._re_fullsizeURL.sub('', self.data.get('headshot', ''))
        return None

    def getID(self):
        """Return the characterID."""
        return self.characterID

    def __nonzero__(self):
        """The Character is "false" if the self.data does not contain a name."""
        # XXX: check the name and the characterID?
        if self.data.get('name'):
            return 1
        return 0

    def __contains__(self, item):
        """Return true if this Character was portrayed in the given Movie
        or it was impersonated by the given Person."""
        # Imported here rather than at module level; the Movie and Person
        # modules import this module in the same lazy way.
        from imdb.Movie import Movie
        from imdb.Person import Person
        if isinstance(item, Person):
            for m in flatten(self.data, yieldDictKeys=1, scalar=Movie):
                if item.isSame(m.currentRole):
                    return 1
        elif isinstance(item, Movie):
            for m in flatten(self.data, yieldDictKeys=1, scalar=Movie):
                if item.isSame(m):
                    return 1
        return 0

    def isSameName(self, other):
        """Return true if two characters have the same name
        and/or characterID."""
        if not isinstance(other, self.__class__):
            return 0
        if 'name' in self.data and \
                'name' in other.data and \
                build_name(self.data, canonical=0) == \
                build_name(other.data, canonical=0):
            return 1
        # IDs are only comparable when both objects use the same access system.
        if self.accessSystem == other.accessSystem and \
                self.characterID is not None and \
                self.characterID == other.characterID:
            return 1
        return 0
    isSameCharacter = isSameName

    def __deepcopy__(self, memo):
        """Return a deep copy of a Character instance."""
        c = Character(name=u'', characterID=self.characterID,
                      myName=self.myName, myID=self.myID,
                      data=deepcopy(self.data, memo),
                      notes=self.notes, accessSystem=self.accessSystem,
                      titlesRefs=deepcopy(self.titlesRefs, memo),
                      namesRefs=deepcopy(self.namesRefs, memo),
                      charactersRefs=deepcopy(self.charactersRefs, memo))
        c.current_info = list(self.current_info)
        c.set_mod_funct(self.modFunct)
        return c

    def __repr__(self):
        """String representation of a Character object."""
        r = '<Character id:%s[%s] name:_%s_>' % (self.characterID,
                                                 self.accessSystem,
                                                 self.get('name'))
        # The formatted result is unicode when any argument is; encode it so a
        # byte string is always returned.
        if isinstance(r, unicode):
            r = r.encode('utf_8', 'replace')
        return r

    def __str__(self):
        """Simply print the short name."""
        return self.get('name', u'').encode('utf_8', 'replace')

    def __unicode__(self):
        """Simply print the short name."""
        return self.get('name', u'')

    def summary(self):
        """Return a string with a pretty-printed summary for the character."""
        if not self:
            return u''
        s = u'Character\n=====\nName: %s\n' % \
            self.get('name', u'')
        bio = self.get('biography')
        if bio:
            s += u'Biography: %s\n' % bio[0]
        filmo = self.get('filmography')
        if filmo:
            # Only the first five filmography entries are listed.
            a_list = [x.get('long imdb canonical title', u'')
                      for x in filmo[:5]]
            s += u'Last movies with this character: %s.\n' % u'; '.join(a_list)
        return s
-195
View File
@@ -1,195 +0,0 @@
"""
company module (imdb package).
This module provides the company class, used to store information about
a given company.
Copyright 2008-2009 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
from copy import deepcopy
from imdb.utils import analyze_company_name, build_company_name, \
flatten, _Container, cmpCompanies
class Company(_Container):
    """A company.

    Every piece of information about a company can be accessed as:
        companyObject['information']
    To get a list of the kinds of information stored in a
    company object, use the keys() method; some useful aliases
    are defined (such as "distributor" for the "distributors" key);
    see the keys_alias dictionary.
    """
    # The default sets of information retrieved.
    default_info = ('main',)

    # Aliases for some not-so-intuitive keys.
    keys_alias = {
        'distributor': 'distributors',
        'special effects company': 'special effects companies',
        'other company': 'miscellaneous companies',
        'miscellaneous company': 'miscellaneous companies',
        'other companies': 'miscellaneous companies',
        'misc companies': 'miscellaneous companies',
        'misc company': 'miscellaneous companies',
        'production company': 'production companies'}

    keys_tomodify_list = ()

    cmpFunct = cmpCompanies

    def _init(self, **kwds):
        """Initialize a company object.

        *companyID* -- the unique identifier for the company.
        *name* -- the name of the company, if not in the data dictionary.
        *myName* -- the nickname you use for this company.
        *myID* -- your personal id for this company.
        *data* -- a dictionary used to initialize the object.
        *notes* -- notes about the given company.
        *accessSystem* -- a string representing the data access system used.
        *titlesRefs* -- a dictionary with references to movies.
        *namesRefs* -- a dictionary with references to persons.
        *charactersRefs* -- a dictionary with references to companies.
        *modFunct* -- function called returning text fields.
        """
        name = kwds.get('name')
        # A name already present in the data dictionary wins over the keyword.
        if name and 'name' not in self.data:
            self.set_name(name)
        self.companyID = kwds.get('companyID', None)
        self.myName = kwds.get('myName', u'')

    def _reset(self):
        """Reset the company object."""
        self.companyID = None
        self.myName = u''

    def set_name(self, name):
        """Set the name of the company.

        A trailing parenthesised part of *name* (from the first '(' to the
        end) is split off and stored in self.notes, unless notes are already
        set, in which case the full stripped name is analyzed unchanged.
        """
        # XXX: convert name to unicode, if it's a plain string?
        # Company diverges a bit from other classes, being able
        # to directly handle its "notes".  AND THAT'S PROBABLY A BAD IDEA!
        oname = name = name.strip()
        notes = u''
        if name.endswith(')'):
            fparidx = name.find('(')
            if fparidx != -1:
                notes = name[fparidx:]
                name = name[:fparidx].rstrip()
        if self.notes:
            # Existing notes are kept, so the parenthesised part stays in
            # the name.
            name = oname
        d = analyze_company_name(name)
        self.data.update(d)
        if notes and not self.notes:
            self.notes = notes

    def _additional_keys(self):
        """Valid keys to append to the data.keys() list."""
        if 'name' in self.data:
            return ['long imdb name']
        return []

    def _getitem(self, key):
        """Handle special keys; return None for any other key."""
        ## XXX: can a company have an imdbIndex?
        if 'name' in self.data:
            if key == 'long imdb name':
                return build_company_name(self.data)
        return None

    def getID(self):
        """Return the companyID."""
        return self.companyID

    def __nonzero__(self):
        """The company is "false" if the self.data does not contain a name."""
        # XXX: check the name and the companyID?
        if self.data.get('name'):
            return 1
        return 0

    def __contains__(self, item):
        """Return true if this company and the given Movie are related."""
        # Imported here rather than at module level; the Movie module imports
        # this module in the same lazy way.
        from imdb.Movie import Movie
        if isinstance(item, Movie):
            for m in flatten(self.data, yieldDictKeys=1, scalar=Movie):
                if item.isSame(m):
                    return 1
        return 0

    def isSameName(self, other):
        """Return true if two companies have the same name
        and/or companyID."""
        if not isinstance(other, self.__class__):
            return 0
        if 'name' in self.data and \
                'name' in other.data and \
                build_company_name(self.data) == \
                build_company_name(other.data):
            return 1
        # IDs are only comparable when both objects use the same access system.
        if self.accessSystem == other.accessSystem and \
                self.companyID is not None and \
                self.companyID == other.companyID:
            return 1
        return 0
    isSameCompany = isSameName

    def __deepcopy__(self, memo):
        """Return a deep copy of a company instance."""
        c = Company(name=u'', companyID=self.companyID,
                    myName=self.myName, myID=self.myID,
                    data=deepcopy(self.data, memo),
                    notes=self.notes, accessSystem=self.accessSystem,
                    titlesRefs=deepcopy(self.titlesRefs, memo),
                    namesRefs=deepcopy(self.namesRefs, memo),
                    charactersRefs=deepcopy(self.charactersRefs, memo))
        c.current_info = list(self.current_info)
        c.set_mod_funct(self.modFunct)
        return c

    def __repr__(self):
        """String representation of a Company object."""
        r = '<Company id:%s[%s] name:_%s_>' % (self.companyID,
                                               self.accessSystem,
                                               self.get('long imdb name'))
        # The formatted result is unicode when any argument is; encode it so a
        # byte string is always returned.
        if isinstance(r, unicode):
            r = r.encode('utf_8', 'replace')
        return r

    def __str__(self):
        """Simply print the short name."""
        return self.get('name', u'').encode('utf_8', 'replace')

    def __unicode__(self):
        """Simply print the short name."""
        return self.get('name', u'')

    def summary(self):
        """Return a string with a pretty-printed summary for the company."""
        if not self:
            return u''
        s = u'Company\n=======\nName: %s\n' % \
            self.get('name', u'')
        for k in ('distributor', 'production company', 'miscellaneous company',
                  'special effects company'):
            # At most five movies are listed per kind of company role.
            d = self.get(k, [])[:5]
            if not d:
                continue
            s += u'Last movies from this company (%s): %s.\n' % \
                 (k, u'; '.join([x.get('long imdb title', u'') for x in d]))
        return s
-398
View File
@@ -1,398 +0,0 @@
"""
Movie module (imdb package).
This module provides the Movie class, used to store information about
a given movie.
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
from copy import deepcopy
from imdb import articles
from imdb.utils import analyze_title, build_title, canonicalTitle, \
flatten, _Container, cmpMovies
class Movie(_Container):
    """A Movie.

    Every piece of information about a movie can be accessed as:
        movieObject['information']
    To get a list of the kinds of information stored in a
    Movie object, use the keys() method; some useful aliases
    are defined (such as "casting" for the "casting director" key); see
    the keys_alias dictionary.
    """
    # The default sets of information retrieved.
    default_info = ('main', 'plot')

    # Aliases for some not-so-intuitive keys.
    keys_alias = {
        'tv schedule': 'airing',
        'user rating': 'rating',
        'plot summary': 'plot',
        'plot summaries': 'plot',
        'directed by': 'director',
        'created by': 'creator',
        'writing credits': 'writer',
        'produced by': 'producer',
        'original music by': 'original music',
        'non-original music by': 'non-original music',
        'music': 'original music',
        'cinematography by': 'cinematographer',
        'cinematography': 'cinematographer',
        'film editing by': 'editor',
        'film editing': 'editor',
        'editing': 'editor',
        'actors': 'cast',
        'actresses': 'cast',
        'casting by': 'casting director',
        'casting': 'casting director',
        'art direction by': 'art direction',
        'set decoration by': 'set decoration',
        'costume design by': 'costume designer',
        'costume design': 'costume designer',
        'makeup department': 'make up',
        'makeup': 'make up',
        'make-up': 'make up',
        'production management': 'production manager',
        'production company': 'production companies',
        'second unit director or assistant director':
            'assistant director',
        'second unit director': 'assistant director',
        'sound department': 'sound crew',
        'costume and wardrobe department': 'costume department',
        'special effects by': 'special effects',
        'visual effects by': 'visual effects',
        'special effects company': 'special effects companies',
        'stunts': 'stunt performer',
        'other crew': 'miscellaneous crew',
        'misc crew': 'miscellaneous crew',
        'miscellaneouscrew': 'miscellaneous crew',
        'crewmembers': 'miscellaneous crew',
        'crew members': 'miscellaneous crew',
        'other companies': 'miscellaneous companies',
        'misc companies': 'miscellaneous companies',
        'miscellaneous company': 'miscellaneous companies',
        'misc company': 'miscellaneous companies',
        'other company': 'miscellaneous companies',
        'aka': 'akas',
        'also known as': 'akas',
        'country': 'countries',
        'production country': 'countries',
        'production countries': 'countries',
        'genre': 'genres',
        'runtime': 'runtimes',
        'lang': 'languages',
        'color': 'color info',
        'cover': 'cover url',
        'full-size cover': 'full-size cover url',
        'seasons': 'number of seasons',
        'language': 'languages',
        'certificate': 'certificates',
        'certifications': 'certificates',
        'certification': 'certificates',
        'miscellaneous links': 'misc links',
        'miscellaneous': 'misc links',
        'soundclips': 'sound clips',
        'videoclips': 'video clips',
        'photographs': 'photo sites',
        'distributor': 'distributors',
        'distribution': 'distributors',
        'distribution companies': 'distributors',
        'distribution company': 'distributors',
        'guest': 'guests',
        'guest appearances': 'guests',
        'tv guests': 'guests',
        'notable tv guest appearances': 'guests',
        'episodes cast': 'guests',
        'episodes number': 'number of episodes',
        'amazon review': 'amazon reviews',
        'merchandising': 'merchandising links',
        'merchandise': 'merchandising links',
        'sales': 'merchandising links',
        'faq': 'faqs',
        'parental guide': 'parents guide',
        'frequently asked questions': 'faqs'}

    keys_tomodify_list = ('plot', 'trivia', 'alternate versions', 'goofs',
                          'quotes', 'dvd', 'laserdisc', 'news', 'soundtrack',
                          'crazy credits', 'business', 'supplements',
                          'video review', 'faqs')

    cmpFunct = cmpMovies

    def _init(self, **kwds):
        """Initialize a Movie object.

        *movieID* -- the unique identifier for the movie.
        *title* -- the title of the Movie, if not in the data dictionary.
        *myTitle* -- your personal title for the movie.
        *myID* -- your personal identifier for the movie.
        *data* -- a dictionary used to initialize the object.
        *currentRole* -- a Character instance representing the current role
                         or duty of a person in this movie, or a Person
                         object representing the actor/actress who played
                         a given character in a Movie.  If a string is
                         passed, an object is automatically built.
        *roleID* -- if available, the characterID/personID of the currentRole
                    object.
        *roleIsPerson* -- when False (default) the currentRole is assumed
                          to be a Character object, otherwise a Person.
        *notes* -- notes for the person referred in the currentRole
                    attribute; e.g.: '(voice)'.
        *accessSystem* -- a string representing the data access system used.
        *titlesRefs* -- a dictionary with references to movies.
        *namesRefs* -- a dictionary with references to persons.
        *charactersRefs* -- a dictionary with references to characters.
        *modFunct* -- function called returning text fields.
        """
        title = kwds.get('title')
        # A title already present in the data dictionary wins over the keyword.
        if title and 'title' not in self.data:
            self.set_title(title)
        self.movieID = kwds.get('movieID', None)
        self.myTitle = kwds.get('myTitle', u'')

    def _reset(self):
        """Reset the Movie object."""
        self.movieID = None
        self.myTitle = u''

    def set_title(self, title):
        """Set the title of the movie."""
        # XXX: convert title to unicode, if it's a plain string?
        d_title = analyze_title(title)
        self.data.update(d_title)

    def _additional_keys(self):
        """Valid keys to append to the data.keys() list."""
        addkeys = []
        if 'title' in self.data:
            addkeys += ['canonical title', 'long imdb title',
                        'long imdb canonical title',
                        'smart canonical title',
                        'smart long imdb canonical title']
        if 'episode of' in self.data:
            addkeys += ['long imdb episode title', 'series title',
                        'canonical series title', 'episode title',
                        'canonical episode title',
                        'smart canonical series title',
                        'smart canonical episode title']
        if 'cover url' in self.data:
            addkeys += ['full-size cover url']
        return addkeys

    def guessLanguage(self):
        """Guess the language of the title of this movie; returns None
        if there are no hints.

        The first entry of 'languages' is preferred; otherwise the first
        entry of 'countries' is looked up in articles.COUNTRY_LANG.
        """
        lang = self.get('languages')
        if lang:
            return lang[0]
        country = self.get('countries')
        if country:
            return articles.COUNTRY_LANG.get(country[0])
        # Always None here, even when 'languages' holds an empty container.
        return None

    def smartCanonicalTitle(self, title=None, lang=None):
        """Return the canonical title, guessing its language.

        The title can be forced with the 'title' argument (internally
        used) and the language can be forced with the 'lang' argument,
        otherwise it's auto-detected."""
        if title is None:
            title = self.data.get('title', u'')
        if lang is None:
            lang = self.guessLanguage()
        return canonicalTitle(title, lang=lang)

    def _getitem(self, key):
        """Handle special keys; return None for any other key."""
        if 'episode of' in self.data:
            if key == 'long imdb episode title':
                return build_title(self.data)
            elif key == 'series title':
                return self.data['episode of']['title']
            elif key == 'canonical series title':
                ser_title = self.data['episode of']['title']
                return canonicalTitle(ser_title)
            elif key == 'smart canonical series title':
                ser_title = self.data['episode of']['title']
                return self.smartCanonicalTitle(ser_title)
            elif key == 'episode title':
                return self.data.get('title', u'')
            elif key == 'canonical episode title':
                return canonicalTitle(self.data.get('title', u''))
            elif key == 'smart canonical episode title':
                return self.smartCanonicalTitle(self.data.get('title', u''))
        if 'title' in self.data:
            if key == 'title':
                return self.data['title']
            elif key == 'long imdb title':
                return build_title(self.data)
            elif key == 'canonical title':
                return canonicalTitle(self.data['title'])
            elif key == 'smart canonical title':
                return self.smartCanonicalTitle(self.data['title'])
            elif key == 'long imdb canonical title':
                return build_title(self.data, canonical=1)
            elif key == 'smart long imdb canonical title':
                return build_title(self.data, canonical=1,
                                   lang=self.guessLanguage())
        if key == 'full-size cover url' and 'cover url' in self.data:
            # _re_fullsizeURL is not defined in this class; presumably it is
            # inherited from _Container.
            return self._re_fullsizeURL.sub('', self.data.get('cover url', ''))
        return None

    def getID(self):
        """Return the movieID."""
        return self.movieID

    def __nonzero__(self):
        """The Movie is "false" if the self.data does not contain a title."""
        # XXX: check the title and the movieID?
        if 'title' in self.data:
            return 1
        return 0

    def isSameTitle(self, other):
        """Return true if this and the compared object have the same
        long imdb title and/or movieID.
        """
        # XXX: obsolete?
        if not isinstance(other, self.__class__):
            return 0
        if 'title' in self.data and \
                'title' in other.data and \
                build_title(self.data, canonical=0) == \
                build_title(other.data, canonical=0):
            return 1
        # IDs are only comparable when both objects use the same access system.
        if self.accessSystem == other.accessSystem and \
                self.movieID is not None and self.movieID == other.movieID:
            return 1
        return 0
    isSameMovie = isSameTitle # XXX: just for backward compatibility.

    def __contains__(self, item):
        """Return true if the given Person object is listed in this Movie,
        or if the given Character is represented in this Movie,
        or if the given Company is related to this Movie."""
        # Imported here rather than at module level; the Person, Character
        # and Company modules import this module in the same lazy way.
        from imdb.Person import Person
        from imdb.Character import Character
        from imdb.Company import Company
        if isinstance(item, Person):
            for p in flatten(self.data, yieldDictKeys=1, scalar=Person,
                             toDescend=(list, dict, tuple, Movie)):
                if item.isSame(p):
                    return 1
        elif isinstance(item, Character):
            for p in flatten(self.data, yieldDictKeys=1, scalar=Person,
                             toDescend=(list, dict, tuple, Movie)):
                if item.isSame(p.currentRole):
                    return 1
        elif isinstance(item, Company):
            for c in flatten(self.data, yieldDictKeys=1, scalar=Company,
                             toDescend=(list, dict, tuple, Movie)):
                if item.isSame(c):
                    return 1
        return 0

    def __deepcopy__(self, memo):
        """Return a deep copy of a Movie instance."""
        m = Movie(title=u'', movieID=self.movieID, myTitle=self.myTitle,
                  myID=self.myID, data=deepcopy(self.data, memo),
                  currentRole=deepcopy(self.currentRole, memo),
                  roleIsPerson=self._roleIsPerson,
                  notes=self.notes, accessSystem=self.accessSystem,
                  titlesRefs=deepcopy(self.titlesRefs, memo),
                  namesRefs=deepcopy(self.namesRefs, memo),
                  charactersRefs=deepcopy(self.charactersRefs, memo))
        m.current_info = list(self.current_info)
        m.set_mod_funct(self.modFunct)
        return m

    def __repr__(self):
        """String representation of a Movie object."""
        # XXX: add also currentRole and notes, if present?
        # Keep the container-level has_key() here: the `in` operator on a
        # Movie goes through __contains__, which tests Person/Character/
        # Company membership and not key presence.
        if self.has_key('long imdb episode title'):
            title = self.get('long imdb episode title')
        else:
            title = self.get('long imdb title')
        r = '<Movie id:%s[%s] title:_%s_>' % (self.movieID, self.accessSystem,
                                              title)
        # The formatted result is unicode when any argument is; encode it so a
        # byte string is always returned.
        if isinstance(r, unicode):
            r = r.encode('utf_8', 'replace')
        return r

    def __str__(self):
        """Simply print the short title."""
        return self.get('title', u'').encode('utf_8', 'replace')

    def __unicode__(self):
        """Simply print the short title."""
        return self.get('title', u'')

    def summary(self):
        """Return a string with a pretty-printed summary for the movie."""
        if not self:
            return u''

        def _nameAndRole(personList, joiner=u', '):
            """Build a pretty string with name and role."""
            nl = []
            for person in personList:
                n = person.get('name', u'')
                if person.currentRole:
                    n += u' (%s)' % person.currentRole
                nl.append(n)
            return joiner.join(nl)

        s = u'Movie\n=====\nTitle: %s\n' % \
            self.get('long imdb canonical title', u'')
        genres = self.get('genres')
        if genres:
            s += u'Genres: %s.\n' % u', '.join(genres)
        director = self.get('director')
        if director:
            s += u'Director: %s.\n' % _nameAndRole(director)
        writer = self.get('writer')
        if writer:
            s += u'Writer: %s.\n' % _nameAndRole(writer)
        cast = self.get('cast')
        if cast:
            # Only the first five cast members are listed.
            cast = cast[:5]
            s += u'Cast: %s.\n' % _nameAndRole(cast)
        runtime = self.get('runtimes')
        if runtime:
            s += u'Runtime: %s.\n' % u', '.join(runtime)
        countries = self.get('countries')
        if countries:
            s += u'Country: %s.\n' % u', '.join(countries)
        lang = self.get('languages')
        if lang:
            s += u'Language: %s.\n' % u', '.join(lang)
        rating = self.get('rating')
        if rating:
            s += u'Rating: %s' % rating
            nr_votes = self.get('votes')
            if nr_votes:
                s += u' (%s votes)' % nr_votes
            s += u'.\n'
        plot = self.get('plot')
        if not plot:
            plot = self.get('plot summary')
            if plot:
                # Wrap the single summary so it can be indexed like 'plot'.
                plot = [plot]
        if plot:
            plot = plot[0]
            # Drop everything from the first '::' separator onwards.
            i = plot.find('::')
            if i != -1:
                plot = plot[:i]
            s += u'Plot: %s' % plot
        return s
-275
View File
@@ -1,275 +0,0 @@
"""
Person module (imdb package).
This module provides the Person class, used to store information about
a given person.
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
from copy import deepcopy
from imdb.utils import analyze_name, build_name, normalizeName, \
flatten, _Container, cmpPeople
class Person(_Container):
"""A Person.
Every information about a person can be accessed as:
personObject['information']
to get a list of the kind of information stored in a
Person object, use the keys() method; some useful aliases
are defined (as "biography" for the "mini biography" key);
see the keys_alias dictionary.
"""
# The default sets of information retrieved.
default_info = ('main', 'filmography', 'biography')
# Aliases for some not-so-intuitive keys.
# Each entry maps an alternative key name to the key it stands for.
keys_alias = {'biography': 'mini biography',
'bio': 'mini biography',
'aka': 'akas',
'also known as': 'akas',
'nick name': 'nick names',
'nicks': 'nick names',
'nickname': 'nick names',
'miscellaneouscrew': 'miscellaneous crew',
'crewmembers': 'miscellaneous crew',
'misc': 'miscellaneous crew',
'guest': 'notable tv guest appearances',
'guests': 'notable tv guest appearances',
'tv guest': 'notable tv guest appearances',
'guest appearances': 'notable tv guest appearances',
'spouses': 'spouse',
'salary': 'salary history',
'salaries': 'salary history',
'otherworks': 'other works',
"maltin's biography":
"biography from leonard maltin's movie encyclopedia",
"leonard maltin's biography":
"biography from leonard maltin's movie encyclopedia",
'real name': 'birth name',
'where are they now': 'where now',
'personal quotes': 'quotes',
'mini-biography author': 'imdb mini-biography by',
'biography author': 'imdb mini-biography by',
'genre': 'genres',
'portrayed': 'portrayed in',
'keys': 'keywords',
'trademarks': 'trade mark',
# NOTE(review): the next entry maps 'trade mark' to itself (a no-op alias).
'trade mark': 'trade mark',
'trade marks': 'trade mark',
'trademark': 'trade mark',
'pictorials': 'pictorial',
'magazine covers': 'magazine cover photo',
'magazine-covers': 'magazine cover photo',
'tv series episodes': 'episodes',
'tv-series episodes': 'episodes',
'articles': 'article',
'keyword': 'keywords'}
# 'nick names'???
# NOTE(review): presumably the keys whose text values are passed through
# modFunct by _Container -- that logic is not visible here, confirm.
keys_tomodify_list = ('mini biography', 'spouse', 'quotes', 'other works',
'salary history', 'trivia', 'trade mark', 'news',
'books', 'biographical movies', 'portrayed in',
'where now', 'interviews', 'article',
"biography from leonard maltin's movie encyclopedia")
# Comparison function imported from imdb.utils.
cmpFunct = cmpPeople
def _init(self, **kwds):
    """Initialize a Person object.

    *personID* -- the unique identifier for the person.
    *name* -- the name of the Person, if not in the data dictionary.
    *myName* -- the nickname you use for this person.
    *myID* -- your personal id for this person.
    *data* -- a dictionary used to initialize the object.
    *currentRole* -- a Character instance representing the current role
                     or duty of a person in this movie, or a Person
                     object representing the actor/actress who played
                     a given character in a Movie.  If a string is
                     passed, an object is automatically built.
    *roleID* -- if available, the characterID/personID of the currentRole
                object.
    *roleIsPerson* -- when False (default) the currentRole is assumed
                      to be a Character object, otherwise a Person.
    *notes* -- notes about the given person for a specific movie
                or role (e.g.: the alias used in the movie credits).
    *accessSystem* -- a string representing the data access system used.
    *titlesRefs* -- a dictionary with references to movies.
    *namesRefs* -- a dictionary with references to persons.
    *modFunct* -- function called returning text fields.
    *billingPos* -- position of this person in the credits list.
    """
    name = kwds.get('name')
    # A name already present in the data dictionary wins over the keyword.
    if name and 'name' not in self.data:
        self.set_name(name)
    self.personID = kwds.get('personID', None)
    self.myName = kwds.get('myName', u'')
    self.billingPos = kwds.get('billingPos', None)
def _reset(self):
"""Reset the Person object."""
self.personID = None
self.myName = u''
self.billingPos = None
def _clear(self):
"""Reset the dictionary."""
self.billingPos = None
def set_name(self, name):
    """Analyze *name* (canonical form) and merge the result into the
    data dictionary."""
    # XXX: convert name to unicode, if it's a plain string?
    self.data.update(analyze_name(name, canonical=1))
def _additional_keys(self):
"""Valid keys to append to the data.keys() list."""
addkeys = []
if self.data.has_key('name'):
addkeys += ['canonical name', 'long imdb name',
'long imdb canonical name']
if self.data.has_key('headshot'):
addkeys += ['full-size headshot']
return addkeys
def _getitem(self, key):
"""Handle special keys."""
if self.data.has_key('name'):
if key == 'name':
return normalizeName(self.data['name'])
elif key == 'canonical name':
return self.data['name']
elif key == 'long imdb name':
return build_name(self.data, canonical=0)
elif key == 'long imdb canonical name':
return build_name(self.data)
if key == 'full-size headshot' and self.data.has_key('headshot'):
return self._re_fullsizeURL.sub('', self.data.get('headshot', ''))
return None
def getID(self):
    """Return the identifier of this person (its personID attribute)."""
    return self.personID
def __nonzero__(self):
"""The Person is "false" if the self.data does not contain a name."""
# XXX: check the name and the personID?
if self.data.has_key('name'): return 1
return 0
def __contains__(self, item):
    """Return true if this Person has worked in the given Movie,
    or if the given Character was played by this Person."""
    from Movie import Movie
    from Character import Character
    # Pick the test to run on every Movie found in the data.
    if isinstance(item, Movie):
        matches = item.isSame
    elif isinstance(item, Character):
        matches = lambda movie: item.isSame(movie.currentRole)
    else:
        return 0
    for movie in flatten(self.data, yieldDictKeys=1, scalar=Movie):
        if matches(movie):
            return 1
    return 0
def isSameName(self, other):
    """Return 1 if *other* is the same person, 0 otherwise.

    Two persons are the same when both have a name and the canonical
    names built from their data are equal, or when they come from the
    same access system and share a non-empty personID.  Objects of a
    different class are never the same.
    """
    if not isinstance(other, self.__class__):
        return 0
    bothNamed = 'name' in self.data and 'name' in other.data
    if bothNamed and build_name(self.data, canonical=1) == \
            build_name(other.data, canonical=1):
        return 1
    sameSystem = self.accessSystem == other.accessSystem
    if sameSystem and self.personID and self.personID == other.personID:
        return 1
    return 0
isSamePerson = isSameName # XXX: just for backward compatibility.
def __deepcopy__(self, memo):
    """Build a new Person whose data, current role and references are
    deep copies of the ones of this instance."""
    dataCopy = deepcopy(self.data, memo)
    roleCopy = deepcopy(self.currentRole, memo)
    titlesCopy = deepcopy(self.titlesRefs, memo)
    namesCopy = deepcopy(self.namesRefs, memo)
    charactersCopy = deepcopy(self.charactersRefs, memo)
    # The name is left empty: it is already part of the copied data.
    clone = Person(name=u'', personID=self.personID, myName=self.myName,
                   myID=self.myID, data=dataCopy, currentRole=roleCopy,
                   roleIsPerson=self._roleIsPerson, notes=self.notes,
                   accessSystem=self.accessSystem, titlesRefs=titlesCopy,
                   namesRefs=namesCopy, charactersRefs=charactersCopy)
    clone.current_info = list(self.current_info)
    clone.set_mod_funct(self.modFunct)
    clone.billingPos = self.billingPos
    return clone
def __repr__(self):
    """Return a byte-string representation with the personID, the
    access system and the long canonical name of the person."""
    # XXX: add also currentRole and notes, if present?
    text = '<Person id:%s[%s] name:_%s_>' % (
        self.personID, self.accessSystem,
        self.get('long imdb canonical name'))
    if isinstance(text, unicode):
        return text.encode('utf_8', 'replace')
    return text
def __str__(self):
    """Return the short name, encoded as UTF-8."""
    shortName = self.get('name', u'')
    return shortName.encode('utf_8', 'replace')
def __unicode__(self):
    """Return the short name (an empty string if there is no name)."""
    shortName = self.get('name', u'')
    return shortName
def summary(self):
    """Return a string with a pretty-printed summary for the person.

    The summary lists the name, the birth and death dates (with their
    notes), the first mini biography, up to three directed movies and
    up to five acted movies; an empty string is returned for a Person
    that evaluates as false."""
    if not self:
        return u''
    chunks = [u'Person\n=====\nName: %s\n' %
              self.get('long imdb canonical name', u'')]
    # Birth and death are rendered the same way: "<date> (<notes>)."
    for template, dateKey, notesKey in (
            (u'Birth date: %s', 'birth date', 'birth notes'),
            (u'Death date: %s', 'death date', 'death notes')):
        when = self.get(dateKey)
        if not when:
            continue
        chunks.append(template % when)
        notes = self.get(notesKey)
        if notes:
            chunks.append(u' (%s)' % notes)
        chunks.append(u'.\n')
    bio = self.get('mini biography')
    if bio:
        chunks.append(u'Biography: %s\n' % bio[0])
    directed = self.get('director')
    if directed:
        titles = [movie.get('long imdb canonical title', u'')
                  for movie in directed[:3]]
        chunks.append(u'Last movies directed: %s.\n' % u'; '.join(titles))
    acted = self.get('actor') or self.get('actress')
    if acted:
        titles = [movie.get('long imdb canonical title', u'')
                  for movie in acted[:5]]
        chunks.append(u'Last movies acted: %s.\n' % u'; '.join(titles))
    return u''.join(chunks)
-907
View File
@@ -1,907 +0,0 @@
"""
imdb package.
This package can be used to retrieve information about a movie or
a person from the IMDb database.
It can fetch data through different media (e.g.: the IMDb web pages,
a SQL database, etc.)
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
# Names exported by "from imdb import *".
__all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company',
            'available_access_systems']
# Version of the package, available both as __version__ and as VERSION.
__version__ = VERSION = '4.8dev20110303'
# Import compatibility module (importing it is enough).
import _compat
import sys, os, ConfigParser, logging
from types import MethodType
from imdb import Movie, Person, Character, Company
import imdb._logging
from imdb._exceptions import IMDbError, IMDbDataAccessError
from imdb.utils import build_title, build_name, build_company_name
# Logger used for auxiliary messages (in this module: problems found
# while reading the configuration files).
_aux_logger = logging.getLogger('imdbpy.aux')
# URLs of the main pages for movies, persons, characters and queries.
# Each '%s' left in a template is a placeholder to be filled by the caller
# (presumably with the imdbID, or with the query string for the find URL).
imdbURL_base = 'http://akas.imdb.com/'
# http://akas.imdb.com/title/
imdbURL_movie_base = '%stitle/' % imdbURL_base
# http://akas.imdb.com/title/tt%s/
imdbURL_movie_main = imdbURL_movie_base + 'tt%s/'
# http://akas.imdb.com/name/
imdbURL_person_base = '%sname/' % imdbURL_base
# http://akas.imdb.com/name/nm%s/
imdbURL_person_main = imdbURL_person_base + 'nm%s/'
# http://akas.imdb.com/character/
imdbURL_character_base = '%scharacter/' % imdbURL_base
# http://akas.imdb.com/character/ch%s/
imdbURL_character_main = imdbURL_character_base + 'ch%s/'
# http://akas.imdb.com/company/
imdbURL_company_base = '%scompany/' % imdbURL_base
# http://akas.imdb.com/company/co%s/
imdbURL_company_main = imdbURL_company_base + 'co%s/'
# http://akas.imdb.com/keyword/%s/
imdbURL_keyword_main = imdbURL_base + 'keyword/%s/'
# http://akas.imdb.com/chart/top
imdbURL_top250 = imdbURL_base + 'chart/top'
# http://akas.imdb.com/chart/bottom
imdbURL_bottom100 = imdbURL_base + 'chart/bottom'
# http://akas.imdb.com/find?%s
imdbURL_find = imdbURL_base + 'find?%s'
# Name of the configuration file; a variant prefixed with a dot is also
# searched by ConfigParserWithCase.
confFileName = 'imdbpy.cfg'
class ConfigParserWithCase(ConfigParser.ConfigParser):
"""A case-sensitive parser for configuration files."""
def __init__(self, defaults=None, confFile=None, *args, **kwds):
"""Initialize the parser.
*defaults* -- defaults values.
*confFile* -- the file (or list of files) to parse."""
ConfigParser.ConfigParser.__init__(self, defaults=defaults)
if confFile is None:
dotFileName = '.' + confFileName
# Current and home directory.
confFile = [os.path.join(os.getcwd(), confFileName),
os.path.join(os.getcwd(), dotFileName),
os.path.join(os.path.expanduser('~'), confFileName),
os.path.join(os.path.expanduser('~'), dotFileName)]
if os.name == 'posix':
sep = getattr(os.path, 'sep', '/')
# /etc/ and /etc/conf.d/
confFile.append(os.path.join(sep, 'etc', confFileName))
confFile.append(os.path.join(sep, 'etc', 'conf.d',
confFileName))
else:
# etc subdirectory of sys.prefix, for non-unix systems.
confFile.append(os.path.join(sys.prefix, 'etc', confFileName))
for fname in confFile:
try:
self.read(fname)
except (ConfigParser.MissingSectionHeaderError,
ConfigParser.ParsingError), e:
_aux_logger.warn('Troubles reading config file: %s' % e)
# Stop at the first valid file.
if self.has_section('imdbpy'):
break
def optionxform(self, optionstr):
"""Option names are case sensitive."""
return optionstr
def _manageValue(self, value):
"""Custom substitutions for values."""
if not isinstance(value, (str, unicode)):
return value
vlower = value.lower()
if vlower in self._boolean_states:
return self._boolean_states[vlower]
elif vlower == 'none':
return None
return value
def get(self, section, option, *args, **kwds):
"""Return the value of an option from a given section."""
value = ConfigParser.ConfigParser.get(self, section, option,
*args, **kwds)
return self._manageValue(value)
def items(self, section, *args, **kwds):
"""Return a list of (key, value) tuples of items of the
given section."""
if section != 'DEFAULT' and not self.has_section(section):
return []
keys = ConfigParser.ConfigParser.options(self, section)
return [(k, self.get(section, k, *args, **kwds)) for k in keys]
def getDict(self, section):
"""Return a dictionary of items of the specified section."""
return dict(self.items(section))
def IMDb(accessSystem=None, *arguments, **keywords):
"""Return an instance of the appropriate class.
The accessSystem parameter is used to specify the kind of
the preferred access system."""
if accessSystem is None or accessSystem in ('auto', 'config'):
try:
cfg_file = ConfigParserWithCase(*arguments, **keywords)
# Parameters set by the code take precedence.
kwds = cfg_file.getDict('imdbpy')
if 'accessSystem' in kwds:
accessSystem = kwds['accessSystem']
del kwds['accessSystem']
else:
accessSystem = 'http'
kwds.update(keywords)
keywords = kwds
except Exception, e:
import logging
logging.getLogger('imdbpy').warn('Unable to read configuration' \
' file; complete error: %s' % e)
# It just LOOKS LIKE a bad habit: we tried to read config
# options from some files, but something is gone horribly
# wrong: ignore everything and pretend we were called with
# the 'http' accessSystem.
accessSystem = 'http'
if 'loggingLevel' in keywords:
imdb._logging.setLevel(keywords['loggingLevel'])
del keywords['loggingLevel']
if 'loggingConfig' in keywords:
logCfg = keywords['loggingConfig']
del keywords['loggingConfig']
try:
import logging.config
logging.config.fileConfig(os.path.expanduser(logCfg))
except Exception, e:
logging.getLogger('imdbpy').warn('unable to read logger ' \
'config: %s' % e)
if accessSystem in ('http', 'web', 'html'):
from parser.http import IMDbHTTPAccessSystem
return IMDbHTTPAccessSystem(*arguments, **keywords)
elif accessSystem in ('httpThin', 'webThin', 'htmlThin'):
import logging
logging.warn('httpThin is badly broken and' \
' will not be fixed; please switch' \
' to "http" or "mobile"')
from parser.http import IMDbHTTPAccessSystem
return IMDbHTTPAccessSystem(isThin=1, *arguments, **keywords)
elif accessSystem in ('mobile',):
from parser.mobile import IMDbMobileAccessSystem
return IMDbMobileAccessSystem(*arguments, **keywords)
elif accessSystem in ('local', 'files'):
# The local access system was removed since IMDbPY 4.2.
raise IMDbError, 'the local access system was removed since IMDbPY 4.2'
elif accessSystem in ('sql', 'db', 'database'):
try:
from parser.sql import IMDbSqlAccessSystem
except ImportError:
raise IMDbError, 'the sql access system is not installed'
return IMDbSqlAccessSystem(*arguments, **keywords)
else:
raise IMDbError, 'unknown kind of data access system: "%s"' \
% accessSystem
def available_access_systems():
    """Return the names of the data access systems that can be imported."""
    systems = []
    # XXX: trying to import modules is a good thing?
    try:
        from parser.http import IMDbHTTPAccessSystem
    except ImportError:
        pass
    else:
        systems.extend(['http', 'httpThin'])
    try:
        from parser.mobile import IMDbMobileAccessSystem
    except ImportError:
        pass
    else:
        systems.append('mobile')
    try:
        from parser.sql import IMDbSqlAccessSystem
    except ImportError:
        pass
    else:
        systems.append('sql')
    return systems
# XXX: I'm not sure this is a good guess.
# I suppose that an argument of the IMDb function can be used to
# set a default encoding for the output, and then Movie, Person and
# Character objects can use this default encoding, returning strings.
# Anyway, passing unicode strings to search_movie(), search_person()
# and search_character() methods is always safer.
# The encoding of sys.stdin when it is set, otherwise Python's default
# encoding; it is used below to decode search strings that are not unicode.
encoding = getattr(sys.stdin, 'encoding', '') or sys.getdefaultencoding()
class IMDbBase:
"""The base class used to search for a movie/person/character and
to get a Movie/Person/Character object.
This class cannot directly fetch data of any kind and so you
have to search the "real" code into a subclass."""
# The name of the preferred access system (MUST be overridden
# in the subclasses).
accessSystem = 'UNKNOWN'
# Top-level logger for IMDbPY.
_imdb_logger = logging.getLogger('imdbpy')
def __init__(self, defaultModFunct=None, results=20, keywordsResults=100,
*arguments, **keywords):
"""Initialize the access system.
If specified, defaultModFunct is the function used by
default by the Person, Movie and Character objects, when
accessing their text fields.
"""
# The function used to output the strings that need modification (the
# ones containing references to movie titles and person names).
self._defModFunct = defaultModFunct
# Number of results to get.
try:
results = int(results)
except (TypeError, ValueError):
results = 20
if results < 1:
results = 20
self._results = results
try:
keywordsResults = int(keywordsResults)
except (TypeError, ValueError):
keywordsResults = 100
if keywordsResults < 1:
keywordsResults = 100
self._keywordsResults = keywordsResults
def _normalize_movieID(self, movieID):
"""Normalize the given movieID."""
# By default, do nothing.
return movieID
def _normalize_personID(self, personID):
"""Normalize the given personID."""
# By default, do nothing.
return personID
def _normalize_characterID(self, characterID):
"""Normalize the given characterID."""
# By default, do nothing.
return characterID
def _normalize_companyID(self, companyID):
"""Normalize the given companyID."""
# By default, do nothing.
return companyID
def _get_real_movieID(self, movieID):
"""Handle title aliases."""
# By default, do nothing.
return movieID
def _get_real_personID(self, personID):
"""Handle name aliases."""
# By default, do nothing.
return personID
def _get_real_characterID(self, characterID):
"""Handle character name aliases."""
# By default, do nothing.
return characterID
def _get_real_companyID(self, companyID):
"""Handle company name aliases."""
# By default, do nothing.
return companyID
def _get_infoset(self, prefname):
"""Return methods with the name starting with prefname."""
infoset = []
excludes = ('%sinfoset' % prefname,)
preflen = len(prefname)
for name in dir(self.__class__):
if name.startswith(prefname) and name not in excludes:
member = getattr(self.__class__, name)
if isinstance(member, MethodType):
infoset.append(name[preflen:].replace('_', ' '))
return infoset
def get_movie_infoset(self):
    """Return the info sets that can be retrieved for a movie
    (one for each 'get_movie_*' method)."""
    return self._get_infoset('get_movie_')
def get_person_infoset(self):
    """Return the info sets that can be retrieved for a person
    (one for each 'get_person_*' method)."""
    return self._get_infoset('get_person_')
def get_character_infoset(self):
    """Return the info sets that can be retrieved for a character
    (one for each 'get_character_*' method)."""
    return self._get_infoset('get_character_')
def get_company_infoset(self):
    """Return the info sets that can be retrieved for a company
    (one for each 'get_company_*' method)."""
    return self._get_infoset('get_company_')
def get_movie(self, movieID, info=Movie.Movie.default_info, modFunct=None):
    """Return a Movie object for the given movieID.

    *movieID* -- something used to univocally identify a movie; it can
                 be the imdbID used by the IMDb web server, a file
                 pointer, a line number in a file, an ID in a
                 database, etc.
    *info* -- the list of sets of information to retrieve.
    *modFunct* -- if specified, the function used by the Movie object
                  when accessing its text fields (like 'plot');
                  otherwise the default one of this access system."""
    realID = self._get_real_movieID(self._normalize_movieID(movieID))
    movie = Movie.Movie(movieID=realID, accessSystem=self.accessSystem)
    textFunct = modFunct or self._defModFunct
    if textFunct is not None:
        movie.set_mod_funct(textFunct)
    self.update(movie, info)
    return movie
get_episode = get_movie
def _search_movie(self, title, results):
"""Return a list of tuples (movieID, {movieData})"""
# XXX: for the real implementation, see the method of the
# subclass, somewhere under the imdb.parser package.
raise NotImplementedError, 'override this method'
def search_movie(self, title, results=None, _episodes=False):
    """Return a list of Movie objects for a query for the given title.

    *title* -- the title to search for; byte strings are decoded with
               the module-level encoding.
    *results* -- the maximum number of results to return (the default
                 of the access system when None).
    *_episodes* -- when true, search only among episodes."""
    if results is None:
        results = self._results
    try:
        results = int(results)
    except (ValueError, OverflowError):
        results = 20
    # XXX: I suppose it will be much safer if the user provides
    #      an unicode string... this is just a guess.
    if not isinstance(title, unicode):
        title = unicode(title, encoding, 'replace')
    if _episodes:
        found = self._search_episode(title, results)
    else:
        found = self._search_movie(title, results)
    movies = []
    for movieID, movieData in found:
        movies.append(Movie.Movie(movieID=self._get_real_movieID(movieID),
                                  data=movieData,
                                  modFunct=self._defModFunct,
                                  accessSystem=self.accessSystem))
    return movies[:results]
def _search_episode(self, title, results):
"""Return a list of tuples (movieID, {movieData})"""
# XXX: for the real implementation, see the method of the
# subclass, somewhere under the imdb.parser package.
raise NotImplementedError, 'override this method'
def search_episode(self, title, results=None):
    """Return a list of Movie objects for a query for the given title,
    searching only among the episodes of tv (mini) series.

    *results* -- the maximum number of results to return."""
    episodes = self.search_movie(title, results=results, _episodes=True)
    return episodes
def get_person(self, personID, info=Person.Person.default_info,
               modFunct=None):
    """Return a Person object for the given personID.

    *personID* -- something used to univocally identify a person; it
                  can be the imdbID used by the IMDb web server, a file
                  pointer, a line number in a file, an ID in a
                  database, etc.
    *info* -- the list of sets of information to retrieve.
    *modFunct* -- if specified, the function used by the Person object
                  when accessing its text fields (like 'mini
                  biography'); otherwise the default one of this
                  access system."""
    realID = self._get_real_personID(self._normalize_personID(personID))
    person = Person.Person(personID=realID, accessSystem=self.accessSystem)
    textFunct = modFunct or self._defModFunct
    if textFunct is not None:
        person.set_mod_funct(textFunct)
    self.update(person, info)
    return person
def _search_person(self, name, results):
"""Return a list of tuples (personID, {personData})"""
# XXX: for the real implementation, see the method of the
# subclass, somewhere under the imdb.parser package.
raise NotImplementedError, 'override this method'
def search_person(self, name, results=None):
    """Return a list of Person objects for a query for the given name.

    *name* -- the name to search for; byte strings are decoded with
              the module-level encoding.
    *results* -- the maximum number of results to return (the default
                 of the access system when None)."""
    if results is None:
        results = self._results
    try:
        results = int(results)
    except (ValueError, OverflowError):
        results = 20
    if not isinstance(name, unicode):
        name = unicode(name, encoding, 'replace')
    persons = []
    for personID, personData in self._search_person(name, results):
        persons.append(Person.Person(
            personID=self._get_real_personID(personID),
            data=personData, modFunct=self._defModFunct,
            accessSystem=self.accessSystem))
    return persons[:results]
def get_character(self, characterID, info=Character.Character.default_info,
                  modFunct=None):
    """Return a Character object for the given characterID.

    *characterID* -- something used to univocally identify a character;
                     it can be the imdbID used by the IMDb web server,
                     a file pointer, a line number in a file, an ID in
                     a database, etc.
    *info* -- the list of sets of information to retrieve.
    *modFunct* -- if specified, the function used by the Character
                  object when accessing its text fields (like
                  'biography'); otherwise the default one of this
                  access system."""
    realID = self._get_real_characterID(
        self._normalize_characterID(characterID))
    character = Character.Character(characterID=realID,
                                    accessSystem=self.accessSystem)
    textFunct = modFunct or self._defModFunct
    if textFunct is not None:
        character.set_mod_funct(textFunct)
    self.update(character, info)
    return character
def _search_character(self, name, results):
"""Return a list of tuples (characterID, {characterData})"""
# XXX: for the real implementation, see the method of the
# subclass, somewhere under the imdb.parser package.
raise NotImplementedError, 'override this method'
def search_character(self, name, results=None):
    """Return a list of Character objects for a query for the given name.

    *name* -- the name to search for; byte strings are decoded with
              the module-level encoding.
    *results* -- the maximum number of results to return (the default
                 of the access system when None)."""
    if results is None:
        results = self._results
    try:
        results = int(results)
    except (ValueError, OverflowError):
        results = 20
    if not isinstance(name, unicode):
        name = unicode(name, encoding, 'replace')
    characters = []
    for characterID, characterData in self._search_character(name, results):
        characters.append(Character.Character(
            characterID=self._get_real_characterID(characterID),
            data=characterData, modFunct=self._defModFunct,
            accessSystem=self.accessSystem))
    return characters[:results]
def get_company(self, companyID, info=Company.Company.default_info,
                modFunct=None):
    """Return a Company object for the given companyID.

    *companyID* -- something used to univocally identify a company; it
                   can be the imdbID used by the IMDb web server, a
                   file pointer, a line number in a file, an ID in a
                   database, etc.
    *info* -- the list of sets of information to retrieve.
    *modFunct* -- if specified, the function used by the Company object
                  when accessing its text fields (none, so far);
                  otherwise the default one of this access system."""
    realID = self._get_real_companyID(self._normalize_companyID(companyID))
    company = Company.Company(companyID=realID,
                              accessSystem=self.accessSystem)
    textFunct = modFunct or self._defModFunct
    if textFunct is not None:
        company.set_mod_funct(textFunct)
    self.update(company, info)
    return company
def _search_company(self, name, results):
"""Return a list of tuples (companyID, {companyData})"""
# XXX: for the real implementation, see the method of the
# subclass, somewhere under the imdb.parser package.
raise NotImplementedError, 'override this method'
def search_company(self, name, results=None):
    """Return a list of Company objects for a query for the given name.

    *name* -- the name to search for; byte strings are decoded with
              the module-level encoding.
    *results* -- the maximum number of results to return (the default
                 of the access system when None)."""
    if results is None:
        results = self._results
    try:
        results = int(results)
    except (ValueError, OverflowError):
        results = 20
    if not isinstance(name, unicode):
        name = unicode(name, encoding, 'replace')
    companies = []
    for companyID, companyData in self._search_company(name, results):
        companies.append(Company.Company(
            companyID=self._get_real_companyID(companyID),
            data=companyData, modFunct=self._defModFunct,
            accessSystem=self.accessSystem))
    return companies[:results]
def _search_keyword(self, keyword, results):
"""Return a list of 'keyword' strings."""
# XXX: for the real implementation, see the method of the
# subclass, somewhere under the imdb.parser package.
raise NotImplementedError, 'override this method'
def search_keyword(self, keyword, results=None):
    """Search for existing keywords, similar to the given one.

    *keyword* -- the keyword to search for; byte strings are decoded
                 with the module-level encoding.
    *results* -- the maximum number of results to ask for (the default
                 for keywords of the access system when None)."""
    if results is None:
        results = self._keywordsResults
    try:
        results = int(results)
    except (ValueError, OverflowError):
        results = 100
    if not isinstance(keyword, unicode):
        keyword = unicode(keyword, encoding, 'replace')
    found = self._search_keyword(keyword, results)
    return found
def _get_keyword(self, keyword, results):
"""Return a list of tuples (movieID, {movieData})"""
# XXX: for the real implementation, see the method of the
# subclass, somewhere under the imdb.parser package.
raise NotImplementedError, 'override this method'
def get_keyword(self, keyword, results=None):
    """Return a list of movies for the given keyword.

    *keyword* -- the keyword; byte strings are decoded with the
                 module-level encoding.
    *results* -- the maximum number of movies to return (the default
                 for keywords of the access system when None)."""
    if results is None:
        results = self._keywordsResults
    try:
        results = int(results)
    except (ValueError, OverflowError):
        results = 100
    # XXX: I suppose it will be much safer if the user provides
    #      an unicode string... this is just a guess.
    if not isinstance(keyword, unicode):
        keyword = unicode(keyword, encoding, 'replace')
    movies = []
    for movieID, movieData in self._get_keyword(keyword, results):
        movies.append(Movie.Movie(movieID=self._get_real_movieID(movieID),
                                  data=movieData,
                                  modFunct=self._defModFunct,
                                  accessSystem=self.accessSystem))
    return movies[:results]
def _get_top_bottom_movies(self, kind):
"""Return the list of the top 250 or bottom 100 movies."""
# XXX: for the real implementation, see the method of the
# subclass, somewhere under the imdb.parser package.
# This method must return a list of (movieID, {movieDict})
# tuples. The kind parameter can be 'top' or 'bottom'.
raise NotImplementedError, 'override this method'
def get_top250_movies(self):
    """Return the top 250 movies as a list of Movie objects."""
    movies = []
    for movieID, movieData in self._get_top_bottom_movies('top'):
        movies.append(Movie.Movie(movieID=self._get_real_movieID(movieID),
                                  data=movieData,
                                  modFunct=self._defModFunct,
                                  accessSystem=self.accessSystem))
    return movies
def get_bottom100_movies(self):
    """Return the bottom 100 movies as a list of Movie objects."""
    movies = []
    for movieID, movieData in self._get_top_bottom_movies('bottom'):
        movies.append(Movie.Movie(movieID=self._get_real_movieID(movieID),
                                  data=movieData,
                                  modFunct=self._defModFunct,
                                  accessSystem=self.accessSystem))
    return movies
def new_movie(self, *arguments, **keywords):
    """Return a Movie object built with the given arguments and bound
    to this access system.

    A 'title' keyword (or, failing that, the second positional
    argument) that is not unicode is decoded with the module-level
    encoding."""
    # XXX: not really useful...
    if 'title' in keywords:
        if not isinstance(keywords['title'], unicode):
            keywords['title'] = unicode(keywords['title'],
                                        encoding, 'replace')
    elif len(arguments) > 1:
        if not isinstance(arguments[1], unicode):
            # *arguments is a tuple and cannot be modified in place
            # (item assignment raises TypeError): build a new one.
            arguments = arguments[:1] + \
                    (unicode(arguments[1], encoding, 'replace'),) + \
                    arguments[2:]
    return Movie.Movie(accessSystem=self.accessSystem,
                        *arguments, **keywords)
def new_person(self, *arguments, **keywords):
    """Return a Person object built with the given arguments and bound
    to this access system.

    A 'name' keyword (or, failing that, the second positional
    argument) that is not unicode is decoded with the module-level
    encoding."""
    # XXX: not really useful...
    if 'name' in keywords:
        if not isinstance(keywords['name'], unicode):
            keywords['name'] = unicode(keywords['name'],
                                        encoding, 'replace')
    elif len(arguments) > 1:
        if not isinstance(arguments[1], unicode):
            # *arguments is a tuple and cannot be modified in place
            # (item assignment raises TypeError): build a new one.
            arguments = arguments[:1] + \
                    (unicode(arguments[1], encoding, 'replace'),) + \
                    arguments[2:]
    return Person.Person(accessSystem=self.accessSystem,
                        *arguments, **keywords)
def new_character(self, *arguments, **keywords):
    """Return a Character object built with the given arguments and
    bound to this access system.

    A 'name' keyword (or, failing that, the second positional
    argument) that is not unicode is decoded with the module-level
    encoding."""
    # XXX: not really useful...
    if 'name' in keywords:
        if not isinstance(keywords['name'], unicode):
            keywords['name'] = unicode(keywords['name'],
                                        encoding, 'replace')
    elif len(arguments) > 1:
        if not isinstance(arguments[1], unicode):
            # *arguments is a tuple and cannot be modified in place
            # (item assignment raises TypeError): build a new one.
            arguments = arguments[:1] + \
                    (unicode(arguments[1], encoding, 'replace'),) + \
                    arguments[2:]
    return Character.Character(accessSystem=self.accessSystem,
                                *arguments, **keywords)
def new_company(self, *arguments, **keywords):
    """Return a Company object built with the given arguments and bound
    to this access system.

    A 'name' keyword (or, failing that, the second positional
    argument) that is not unicode is decoded with the module-level
    encoding."""
    # XXX: not really useful...
    if 'name' in keywords:
        if not isinstance(keywords['name'], unicode):
            keywords['name'] = unicode(keywords['name'],
                                        encoding, 'replace')
    elif len(arguments) > 1:
        if not isinstance(arguments[1], unicode):
            # *arguments is a tuple and cannot be modified in place
            # (item assignment raises TypeError): build a new one.
            arguments = arguments[:1] + \
                    (unicode(arguments[1], encoding, 'replace'),) + \
                    arguments[2:]
    return Company.Company(accessSystem=self.accessSystem,
                            *arguments, **keywords)
def update(self, mop, info=None, override=0):
    """Given a Movie, Person, Character or Company object with only
    partial information, retrieve the required set of information.

    *mop* -- the Movie, Person, Character or Company object to update.
    *info* -- the set (or list of sets) of information to retrieve;
              None means the default sets of the object, 'all' every
              set listed by the get_*_infoset() method of this instance.
    *override* -- if set, info sets already present in the object are
                  retrieved again.

    IMDbError is raised if mop is not an instance of one of the four
    classes, IMDbDataAccessError if its ID is None (a Character without
    characterID is silently left untouched).  Exceptions raised while
    retrieving a single info set are logged and do not stop the update.
    """
    # XXX: should this be a method of the Movie/Person/Character/Company
    #      classes?  NO!  What for instances created by external functions?
    mopID = None
    prefix = ''
    # The prefix selects the family of get_<prefix>_<infoset> methods.
    if isinstance(mop, Movie.Movie):
        mopID = mop.movieID
        prefix = 'movie'
    elif isinstance(mop, Person.Person):
        mopID = mop.personID
        prefix = 'person'
    elif isinstance(mop, Character.Character):
        mopID = mop.characterID
        prefix = 'character'
    elif isinstance(mop, Company.Company):
        mopID = mop.companyID
        prefix = 'company'
    else:
        raise IMDbError, 'object ' + repr(mop) + \
                ' is not a Movie, Person, Character or Company instance'
    if mopID is None:
        # XXX: enough?  It's obvious that there are Characters
        #      objects without characterID, so I think they should
        #      just do nothing, when an i.update(character) is tried.
        if prefix == 'character':
            return
        raise IMDbDataAccessError, \
                'the supplied object has null movieID, personID or companyID'
    # Objects created by another access system are updated through an
    # instance of that system.
    if mop.accessSystem == self.accessSystem:
        aSystem = self
    else:
        aSystem = IMDb(mop.accessSystem)
    if info is None:
        info = mop.default_info
    elif info == 'all':
        # Note: the list of info sets is taken from self, not from aSystem.
        if isinstance(mop, Movie.Movie):
            info = self.get_movie_infoset()
        elif isinstance(mop, Person.Person):
            info = self.get_person_infoset()
        elif isinstance(mop, Character.Character):
            info = self.get_character_infoset()
        else:
            info = self.get_company_infoset()
    # A single info set is accepted as well as a tuple or list of them.
    if not isinstance(info, (tuple, list)):
        info = (info,)
    # Data collected from every info set; stored in the object at the end.
    res = {}
    for i in info:
        if i in mop.current_info and not override:
            continue
        if not i:
            continue
        self._imdb_logger.debug('retrieving "%s" info set', i)
        try:
            method = getattr(aSystem, 'get_%s_%s' %
                                (prefix, i.replace(' ', '_')))
        except AttributeError:
            self._imdb_logger.error('unknown information set "%s"', i)
            # Keeps going.
            method = lambda *x: {}
        try:
            ret = method(mopID)
        except Exception, e:
            # Logged with the traceback; the info set is treated as empty.
            self._imdb_logger.critical('caught an exception retrieving ' \
                                'or parsing "%s" info set for mopID ' \
                                '"%s" (accessSystem: %s)',
                                i, mopID, mop.accessSystem, exc_info=True)
            ret = {}
        keys = None
        if 'data' in ret:
            res.update(ret['data'])
            if isinstance(ret['data'], dict):
                keys = ret['data'].keys()
        # Record which info sets (and keys) are now present in the object.
        if 'info sets' in ret:
            for ri in ret['info sets']:
                mop.add_to_current_info(ri, keys, mainInfoset=i)
        else:
            mop.add_to_current_info(i, keys)
        if 'titlesRefs' in ret:
            mop.update_titlesRefs(ret['titlesRefs'])
        if 'namesRefs' in ret:
            mop.update_namesRefs(ret['namesRefs'])
        if 'charactersRefs' in ret:
            mop.update_charactersRefs(ret['charactersRefs'])
    mop.set_data(res, override=0)
def get_imdbMovieID(self, movieID):
    """Translate a movieID in an imdbID (the ID used by the IMDb
    web server).

    Must be overridden by the subclass: this version always raises
    NotImplementedError."""
    # XXX: for the real implementation, see the method of the
    #      subclass, somewhere under the imdb.parser package.
    raise NotImplementedError('override this method')
def get_imdbPersonID(self, personID):
    """Translate a personID in an imdbID (the ID used by the IMDb
    web server).

    Must be overridden by the subclass: this version always raises
    NotImplementedError."""
    # XXX: for the real implementation, see the method of the
    #      subclass, somewhere under the imdb.parser package.
    raise NotImplementedError('override this method')
def get_imdbCharacterID(self, characterID):
    """Translate a characterID in an imdbID (the ID used by the IMDb
    web server).

    Must be overridden by the subclass: this version always raises
    NotImplementedError."""
    # XXX: for the real implementation, see the method of the
    #      subclass, somewhere under the imdb.parser package.
    raise NotImplementedError('override this method')
def get_imdbCompanyID(self, companyID):
    """Translate a companyID in an imdbID (the ID used by the IMDb
    web server).

    Must be overridden by the subclass: this version always raises
    NotImplementedError."""
    # XXX: for the real implementation, see the method of the
    #      subclass, somewhere under the imdb.parser package.
    raise NotImplementedError('override this method')
def _searchIMDb(self, kind, ton):
"""Search the IMDb akas server for the given title or name."""
# The Exact Primary search system has gone AWOL, so we resort
# to the mobile search. :-/
if not ton:
return None
aSystem = IMDb('mobile')
if kind == 'tt':
searchFunct = aSystem.search_movie
check = 'long imdb canonical title'
elif kind == 'nm':
searchFunct = aSystem.search_person
check = 'long imdb canonical name'
elif kind == 'char':
searchFunct = aSystem.search_character
check = 'long imdb canonical name'
elif kind == 'co':
# XXX: are [COUNTRY] codes included in the results?
searchFunct = aSystem.search_company
check = 'long imdb name'
try:
searchRes = searchFunct(ton)
except IMDbError:
return None
# When only one result is returned, assume it was from an
# exact match.
if len(searchRes) == 1:
return searchRes[0].getID()
for item in searchRes:
# Return the first perfect match.
if item[check] == ton:
return item.getID()
return None
def title2imdbID(self, title):
    """Translate a movie title (in the plain text data files format)
    to an imdbID.

    The title is looked up with a movie ('tt') search on IMDb;
    None is returned if it's unable to get the imdbID."""
    imdbID = self._searchIMDb('tt', title)
    return imdbID
def name2imdbID(self, name):
    """Translate a person name in an imdbID.

    Try an Exact Primary Name search on IMDb;
    return None if it's unable to get the imdbID."""
    # 'nm' selects the person search; 'tt' would look the name up
    # among the movie titles and never match a person.
    return self._searchIMDb('nm', name)
def character2imdbID(self, name):
    """Translate a character name in an imdbID.

    The name is looked up with a character ('char') search on IMDb;
    None is returned if it's unable to get the imdbID."""
    imdbID = self._searchIMDb('char', name)
    return imdbID
def company2imdbID(self, name):
    """Translate a company name in an imdbID.

    The name is looked up with a company ('co') search on IMDb;
    None is returned if it's unable to get the imdbID."""
    imdbID = self._searchIMDb('co', name)
    return imdbID
def get_imdbID(self, mop):
    """Return the imdbID for the given Movie, Person, Character or Company
    object.

    When the object has an ID, it's translated with the matching
    get_imdb*ID method; otherwise its title or name is searched for.
    Raise IMDbError if *mop* is an instance of none of those classes."""
    imdbID = None
    # Work with the data access system the object belongs to, creating
    # a new one only if it differs from the current one.
    if mop.accessSystem == self.accessSystem:
        aSystem = self
    else:
        aSystem = IMDb(mop.accessSystem)
    if isinstance(mop, Movie.Movie):
        if mop.movieID is not None:
            imdbID = aSystem.get_imdbMovieID(mop.movieID)
        else:
            imdbID = aSystem.title2imdbID(build_title(mop, canonical=0,
                                                      ptdf=1))
    elif isinstance(mop, Person.Person):
        if mop.personID is not None:
            imdbID = aSystem.get_imdbPersonID(mop.personID)
        else:
            imdbID = aSystem.name2imdbID(build_name(mop, canonical=1))
    elif isinstance(mop, Character.Character):
        if mop.characterID is not None:
            imdbID = aSystem.get_imdbCharacterID(mop.characterID)
        else:
            # canonical=0 ?
            imdbID = aSystem.character2imdbID(build_name(mop, canonical=1))
    elif isinstance(mop, Company.Company):
        if mop.companyID is not None:
            imdbID = aSystem.get_imdbCompanyID(mop.companyID)
        else:
            imdbID = aSystem.company2imdbID(build_company_name(mop))
    else:
        # Company instances are accepted above, so the message lists them.
        raise IMDbError('object ' + repr(mop) +
                ' is not a Movie, Person, Character or Company instance')
    return imdbID
def get_imdbURL(self, mop):
    """Return the main IMDb URL for the given Movie, Person,
    Character or Company object, or None if unable to get it.

    Raise IMDbError if the object is of none of those classes."""
    imdbID = self.get_imdbID(mop)
    if imdbID is None:
        return None
    # Each class has its own URL template, filled in with the imdbID.
    if isinstance(mop, Movie.Movie):
        return imdbURL_movie_main % imdbID
    if isinstance(mop, Person.Person):
        return imdbURL_person_main % imdbID
    if isinstance(mop, Character.Character):
        return imdbURL_character_main % imdbID
    if isinstance(mop, Company.Company):
        return imdbURL_company_main % imdbID
    raise IMDbError('object ' + repr(mop) +
            ' is not a Movie, Person, Character or Company instance')
def get_special_methods(self):
    """Return the special methods defined by the subclass.

    The result maps the name of every method of the class of this
    instance to its docstring, leaving out the names starting with
    an underscore, the methods of IMDbBase and the
    get_(movie|person|company|character)_* ones."""
    inherited = [attr for attr in dir(IMDbBase)
                 if isinstance(getattr(IMDbBase, attr), MethodType)]
    cls = self.__class__
    sm_dict = {}
    for name in dir(cls):
        if name in inherited or name.startswith('_') or \
                name.startswith('get_movie_') or \
                name.startswith('get_person_') or \
                name.startswith('get_company_') or \
                name.startswith('get_character_'):
            continue
        member = getattr(cls, name)
        if isinstance(member, MethodType):
            sm_dict[name] = member.__doc__
    return sm_dict
-72
View File
@@ -1,72 +0,0 @@
"""
_compat module (imdb package).
This module provides compatibility functions used by the imdb package
to deal with unusual environments.
Copyright 2008-2010 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
# TODO: now we're heavily using the 'logging' module, which was not
# present in Python 2.2. To work in a Symbian environment, we
# need to create a fake 'logging' module (its functions may call
# the 'warnings' module, or do nothing at all).
import os
# If true, we're working on a Symbian device.
if os.name == 'e32':
# Replace os.path.expandvars and os.path.expanduser, if needed.
def _noact(x):
"""Ad-hoc replacement for IMDbPY: return the argument unchanged."""
return x
# Merely accessing the attribute tells whether the function exists;
# when it doesn't, the do-nothing replacement is installed.
try:
os.path.expandvars
except AttributeError:
os.path.expandvars = _noact
try:
os.path.expanduser
except AttributeError:
os.path.expanduser = _noact
# time.strptime is missing, on Symbian devices.
import time
try:
time.strptime
except AttributeError:
import re
# Matches 'Episode dated <day> <month name> <year>'.
_re_web_time = re.compile(r'Episode dated (\d+) (\w+) (\d+)')
# Matches '(<year>-<month>-<day>)', going by how strptime reads the groups.
_re_ptdf_time = re.compile(r'\((\d+)-(\d+)-(\d+)\)')
# English month names mapped to the number of the month, as a string.
_month2digit = {'January': '1', 'February': '2', 'March': '3',
'April': '4', 'May': '5', 'June': '6', 'July': '7',
'August': '8', 'September': '9', 'October': '10',
'November': '11', 'December': '12'}
def strptime(s, format):
"""Ad-hoc strptime replacement for IMDbPY.

Only two layouts are understood: 'Episode dated <day> <month> <year>'
when *format* starts with 'Episode', '(<year>-<month>-<day>)' otherwise.
Return the 9-item tuple (year, month, day, 0, 0, 0, 0, 1, 0);
raise ValueError if *s* can't be parsed."""
try:
if format.startswith('Episode'):
res = _re_web_time.findall(s)[0]
return (int(res[2]), int(_month2digit[res[1]]), int(res[0]),
0, 0, 0, 0, 1, 0)
else:
res = _re_ptdf_time.findall(s)[0]
return (int(res[0]), int(res[1]), int(res[2]),
0, 0, 0, 0, 1, 0)
except:
# Whatever went wrong above, it's reported as a ValueError.
raise ValueError, u'error in IMDbPY\'s ad-hoc strptime!'
# Install the replacement under the standard name.
time.strptime = strptime
-46
View File
@@ -1,46 +0,0 @@
"""
_exceptions module (imdb package).
This module provides the exception hierarchy used by the imdb package.
Copyright 2004-2009 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import logging
class IMDbError(Exception, object):
    """Root of the hierarchy of the exceptions raised by the imdb package."""
    # Logger used by every exception of the package.
    _logger = logging.getLogger('imdbpy')

    def __init__(self, *args, **kwargs):
        """Send the message to the log system, then initialize the exception."""
        excName = self.__class__.__name__
        # Creating an exception also dispatches a critical log.
        self._logger.critical('%s exception raised; args: %s; kwds: %s',
                              excName, args, kwargs, exc_info=True)
        super(IMDbError, self).__init__(*args, **kwargs)
class IMDbDataAccessError(IMDbError):
    """Exception raised when it's not possible to access the needed data."""
class IMDbParserError(IMDbError):
    """Exception raised when an error occurs while parsing the data."""
-63
View File
@@ -1,63 +0,0 @@
"""
_logging module (imdb package).
This module provides the logging facilities used by the imdb package.
Copyright 2009-2010 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import logging
# Names accepted by setLevel, with the matching logging levels.
LEVELS = {
    'debug': logging.DEBUG,
    'info': logging.INFO,
    'warn': logging.WARNING,
    'warning': logging.WARNING,
    'error': logging.ERROR,
    'critical': logging.CRITICAL,
}

# The main logger of the package, writing through a stream handler
# with the package's own line format.
imdbpyLogger = logging.getLogger('imdbpy')
imdbpyFormatter = logging.Formatter(
    '%(asctime)s %(levelname)s [%(name)s]'
    ' %(pathname)s:%(lineno)d: %(message)s')
imdbpyStreamHandler = logging.StreamHandler()
imdbpyStreamHandler.setFormatter(imdbpyFormatter)
imdbpyLogger.addHandler(imdbpyStreamHandler)
def setLevel(level):
    """Set the logging level of the main logger.

    *level* is one of the names in LEVELS (case and surrounding spaces
    are ignored); any other name selects logging.NOTSET."""
    levelName = level.lower().strip()
    threshold = LEVELS.get(levelName, logging.NOTSET)
    imdbpyLogger.setLevel(threshold)
    # The change is logged at the very level that was just set.
    imdbpyLogger.log(imdbpyLogger.level, 'set logging threshold to "%s"',
                     logging.getLevelName(imdbpyLogger.level))
#imdbpyLogger.setLevel(logging.DEBUG)
# It can be an idea to have a single function to log and warn:
#import warnings
#def log_and_warn(msg, args=None, logger=None, level=None):
# """Log the message and issue a warning."""
# if logger is None:
# logger = imdbpyLogger
# if level is None:
# level = logging.WARNING
# if args is None:
# args = ()
# #warnings.warn(msg % args, stacklevel=0)
# logger.log(level, msg % args)
-142
View File
@@ -1,142 +0,0 @@
"""
articles module (imdb package).
This module provides functions and data to handle in a smart way
articles (in various languages) at the beginning of movie titles.
Copyright 2009 Davide Alberani <da@erlug.linux.it>
2009 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
# List of generic articles used when the language of the title is unknown (or
# we don't have information about articles in that language).
# XXX: Managing titles in a lot of different languages, a function to recognize
# an initial article can't be perfect; sometimes we'll stumble upon a short
# word that is an article in some language, but it's not in another; in these
# situations we have to choose if we want to interpret this little word
# as an article or not (remember that we don't know what the original language
# of the title was).
# Example: 'en' is (I suppose) an article in Some Language. Unfortunately it
# seems also to be a preposition in other languages (French?).
# Running a script over the whole list of titles (and aliases), I've found
# that 'en' is used as an article only 376 times, and as another thing 594
# times, so I've decided to _always_ consider 'en' as a non article.
#
# Here is a list of words that are _never_ considered as articles, complete
# with the count of times they are used in one way or another:
# 'en' (376 vs 594), 'to' (399 vs 727), 'as' (198 vs 276), 'et' (79 vs 99),
# 'des' (75 vs 150), 'al' (78 vs 304), 'ye' (14 vs 70),
# 'da' (23 vs 298), "'n" (8 vs 12)
#
# I've left in the list 'i' (1939 vs 2151) and 'uno' (52 vs 56)
# I'm not sure what '-al' is, and so I've left it out...
#
# Generic list of articles in utf-8 encoding; the non-ascii ones are
# written as escaped utf-8 bytes:
GENERIC_ARTICLES = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
"l'", 'il', 'das', 'les', 'i', 'o', 'ein', 'un', 'de', 'los',
'an', 'una', 'las', 'eine', 'den', 'het', 'gli', 'lo', 'os',
'ang', 'oi', 'az', 'een', 'ha-', 'det', 'ta', 'al-',
'mga', "un'", 'uno', 'ett', 'dem', 'egy', 'els', 'eines',
'\xc3\x8f', '\xc3\x87', '\xc3\x94\xc3\xaf', '\xc3\x8f\xc3\xa9')
# Lists of articles separated by language. If possible, the list should
# be sorted by frequency (not very important, but...)
# If you want to add a list of articles for another language, mail it
# to imdbpy-devel@lists.sourceforge.net; non-ascii articles must be utf-8
# encoded.
# Maps the name of a language to the tuple of its articles.
LANG_ARTICLES = {
'English': ('the', 'a', 'an'),
'Italian': ('la', 'le', "l'", 'il', 'i', 'un', 'una', 'gli', 'lo', "un'",
'uno'),
'Spanish': ('la', 'le', 'el', 'les', 'un', 'los', 'una', 'uno', 'unos',
'unas'),
'Portuguese': ('a', 'as', 'o', 'os', 'um', 'uns', 'uma', 'umas'),
'Turkish': (), # Some languages don't have articles.
}
# Shortcut to the bound 'get' method of the dictionary above.
LANG_ARTICLESget = LANG_ARTICLES.get
# Maps a language to countries where it is the main language.
# If you want to add an entry for another language or country, mail it at
# imdbpy-devel@lists.sourceforge.net .
_LANG_COUNTRIES = {
    'English': ('USA', 'UK', 'Canada', 'Ireland', 'Australia'),
    'Italian': ('Italy',),
    'Spanish': ('Spain', 'Mexico'),
    'Portuguese': ('Portugal', 'Brazil'),
    'Turkish': ('Turkey',),
    #'German': ('Germany', 'East Germany', 'West Germany'),
    #'French': ('France'),
}

# The reverse mapping: from a country to its main language.
COUNTRY_LANG = {}
for lang, _countries in _LANG_COUNTRIES.items():
    for country in _countries:
        COUNTRY_LANG[country] = lang
def toUnicode(articles):
    """Decode the given utf-8 encoded articles, returning a tuple of
    unicode strings."""
    return tuple(article.decode('utf_8') for article in articles)
def toDicts(articles):
    """Build two dictionaries for faster matches out of a list of utf-8
    encoded articles: the first has the encoded articles as keys, the
    second their unicode version; every key is mapped to itself."""
    uArticles = toUnicode(articles)
    encoded = dict((art, art) for art in articles)
    decoded = dict((art, art) for art in uArticles)
    return encoded, decoded
def addTrailingSpace(articles):
    """Return two lists built from the given utf-8 encoded articles:
    the first still utf-8 encoded, the second in unicode.

    A space is appended to every article, unless it ends with ' or -."""
    encoded = []
    decoded = []
    for art in articles:
        # Articles like "l'" and 'ha-' are attached to the next word.
        if art[-1] not in ("'", '-'):
            art = art + ' '
        encoded.append(art)
        decoded.append(art.decode('utf_8'))
    return encoded, decoded
# Caches, keyed by language: results of articlesDictsForLang and
# spArticlesForLang, respectively.
_ART_CACHE = {}
_SP_ART_CACHE = {}
def articlesDictsForLang(lang):
    """Return the dictionaries of articles (see toDicts) for the given
    language; the generic articles are used for unknown languages.

    Results are cached per language."""
    try:
        return _ART_CACHE[lang]
    except KeyError:
        pass
    articles = LANG_ARTICLESget(lang, GENERIC_ARTICLES)
    artDicts = toDicts(articles)
    _ART_CACHE[lang] = artDicts
    return artDicts
def spArticlesForLang(lang):
    """Return the lists of articles, plus optional spaces (see
    addTrailingSpace), for the given language; the generic articles
    are used for unknown languages.

    Results are cached per language."""
    if lang not in _SP_ART_CACHE:
        articles = LANG_ARTICLESget(lang, GENERIC_ARTICLES)
        _SP_ART_CACHE[lang] = addTrailingSpace(articles)
    return _SP_ART_CACHE[lang]
-548
View File
@@ -1,548 +0,0 @@
"""
helpers module (imdb package).
This module provides functions not used directly by the imdb package,
but useful for IMDbPY-based programs.
Copyright 2006-2010 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
# XXX: find better names for the functions in this modules.
import re
from cgi import escape
import gettext
from gettext import gettext as _
gettext.textdomain('imdbpy')
# The modClearRefs can be used to strip names and titles references from
# the strings in Movie and Person objects.
from imdb.utils import modClearRefs, re_titleRef, re_nameRef, \
re_characterRef, _tagAttr, _Container, TAGS_TO_MODIFY
from imdb import IMDb, imdbURL_movie_base, imdbURL_person_base, \
imdbURL_character_base
import imdb.locale
from imdb.Movie import Movie
from imdb.Person import Person
from imdb.Character import Character
from imdb.Company import Company
from imdb.parser.http.utils import re_entcharrefssub, entcharrefs, \
subXMLRefs, subSGMLRefs
from imdb.parser.http.bsouplxml.etree import BeautifulSoup
# A URL, more or less: 'http://' followed by everything up to the
# next whitespace or the end of the string (case-insensitive).
_re_href = re.compile(r'(http://.+?)(?=\s|$)', re.I)
# Shortcut to the 'sub' method of the pattern above.
_re_hrefsub = _re_href.sub
def makeCgiPrintEncoding(encoding):
    """Make a function to pretty-print strings for the web.

    *encoding* -- name of the encoding applied by the returned function
                  to unicode strings.

    The returned function takes a string, escapes it (quotes included)
    and, if the result is unicode, encodes it replacing the chars
    outside the charset with XML char references."""
    def cgiPrint(s):
        s = escape(s, quote=1)
        if isinstance(s, unicode):
            s = s.encode(encoding, 'xmlcharrefreplace')
        return s
    # A string followed by the % operator is a plain expression, not a
    # docstring: written inside the function it was silently discarded,
    # so the documentation is attached here.
    cgiPrint.__doc__ = """Encode the given string using the %s encoding, and replace
        chars outside the given charset with XML char references.""" % encoding
    return cgiPrint
# cgiPrint uses the latin_1 encoding.
cgiPrint = makeCgiPrintEncoding('latin_1')
# Regular expression for %(varname)s substitutions.
re_subst = re.compile(r'%\((.+?)\)s')
# Regular expression for <if condition>....</if condition> clauses;
# the closing tag must repeat the condition of the opening one (\1).
re_conditional = re.compile(r'<if\s+(.+?)\s*>(.+?)</if\s+\1\s*>')
def makeTextNotes(replaceTxtNotes):
"""Create a function useful to handle text[::optional_note] values.
replaceTxtNotes is a format string, which can include the following
values: %(text)s and %(notes)s.
Portions of the text can be conditionally excluded, if one of the
values is absent. E.g.: <if notes>[%(notes)s]</if notes> will be replaced
with '[notes]' if notes exists, or by an empty string otherwise.
The returned function is suitable to be passed as applyToValues argument
of the makeObject2Txt function."""
def _replacer(s):
outS = replaceTxtNotes
# Anything but a string is returned untouched.
if not isinstance(s, (unicode, str)):
return s
# Only the first '::' separates the text from the notes.
ssplit = s.split('::', 1)
text = ssplit[0]
# Used to keep track of text and note existence.
keysDict = {}
if text:
keysDict['text'] = True
outS = outS.replace('%(text)s', text)
if len(ssplit) == 2:
keysDict['notes'] = True
outS = outS.replace('%(notes)s', ssplit[1])
else:
outS = outS.replace('%(notes)s', u'')
def _excludeFalseConditionals(matchobj):
# Return an empty string if the conditional is false/empty.
if matchobj.group(1) in keysDict:
return matchobj.group(2)
return u''
# Substitute until no conditional clause is left in the string.
while re_conditional.search(outS):
outS = re_conditional.sub(_excludeFalseConditionals, outS)
return outS
return _replacer
def makeObject2Txt(movieTxt=None, personTxt=None, characterTxt=None,
companyTxt=None, joiner=' / ',
applyToValues=lambda x: x, _recurse=True):
"""Return a function useful to pretty-print Movie, Person,
Character and Company instances.
*movieTxt* -- how to format a Movie object.
*personTxt* -- how to format a Person object.
*characterTxt* -- how to format a Character object.
*companyTxt* -- how to format a Company object.
*joiner* -- string used to join a list of objects.
*applyToValues* -- function to apply to values.
*_recurse* -- if True (default) manage only the given object.

The format strings can contain %(key)s substitutions and
<if key>...</if key> conditional clauses.
"""
# Some useful defaults.
if movieTxt is None:
movieTxt = '%(long imdb title)s'
if personTxt is None:
personTxt = '%(long imdb name)s'
if characterTxt is None:
characterTxt = '%(long imdb name)s'
if companyTxt is None:
companyTxt = '%(long imdb name)s'
def object2txt(obj, _limitRecursion=None):
"""Pretty-print objects."""
# Prevent unlimited recursion.
if _limitRecursion is None:
_limitRecursion = 0
elif _limitRecursion > 5:
return u''
_limitRecursion += 1
# Lists, tuples and dictionaries are formatted item by item.
if isinstance(obj, (list, tuple)):
return joiner.join([object2txt(o, _limitRecursion=_limitRecursion)
for o in obj])
elif isinstance(obj, dict):
# XXX: not exactly nice, neither useful, I fear.
return joiner.join([u'%s::%s' %
(object2txt(k, _limitRecursion=_limitRecursion),
object2txt(v, _limitRecursion=_limitRecursion))
for k, v in obj.items()])
objData = {}
# Pick the format string matching the class of the object; objects
# of any other class are returned as they are.
if isinstance(obj, Movie):
objData['movieID'] = obj.movieID
outs = movieTxt
elif isinstance(obj, Person):
objData['personID'] = obj.personID
outs = personTxt
elif isinstance(obj, Character):
objData['characterID'] = obj.characterID
outs = characterTxt
elif isinstance(obj, Company):
objData['companyID'] = obj.companyID
outs = companyTxt
else:
return obj
def _excludeFalseConditionals(matchobj):
# Return an empty string if the conditional is false/empty.
condition = matchobj.group(1)
# The condition is looked up among the keys of the object first,
# then among its attributes.
proceed = obj.get(condition) or getattr(obj, condition, None)
if proceed:
return matchobj.group(2)
else:
return u''
# NOTE(review): both branches above return, so this looks unreachable.
return matchobj.group(2)
# Substitute until no conditional clause is left in the string.
while re_conditional.search(outs):
outs = re_conditional.sub(_excludeFalseConditionals, outs)
for key in re_subst.findall(outs):
# Same lookup used for the conditions: keys first, then attributes.
value = obj.get(key) or getattr(obj, key, None)
if not isinstance(value, (unicode, str)):
if not _recurse:
if value:
value = unicode(value)
if value:
value = object2txt(value, _limitRecursion=_limitRecursion)
elif value:
value = applyToValues(unicode(value))
# Missing/empty values are replaced by an empty string.
if not value:
value = u''
elif not isinstance(value, (unicode, str)):
value = unicode(value)
outs = outs.replace(u'%(' + key + u')s', value)
return outs
return object2txt
def makeModCGILinks(movieTxt, personTxt, characterTxt=None,
                    encoding='latin_1'):
    """Make a function used to pretty-print movies and persons references;
    movieTxt and personTxt are the strings used for the substitutions.

    movieTxt must contain %(movieID)s and %(title)s, personTxt
    %(personID)s and %(name)s, and characterTxt %(characterID)s
    and %(name)s; characterTxt is optional, for backward compatibility.
    The format strings are also stored as attributes of the returned
    function."""
    _cgiPrint = makeCgiPrintEncoding(encoding)

    def _linkText(text):
        # The text shown by a link: escaped and encoded by _cgiPrint,
        # then converted back to unicode.
        return unicode(_cgiPrint(text), encoding, 'xmlcharrefreplace')

    def modCGILinks(s, titlesRefs, namesRefs, characterRefs=None):
        """Substitute movies and persons references."""
        if characterRefs is None:
            characterRefs = {}

        # References without a (true) entry in the matching dictionary
        # are replaced by their bare text.
        def _replaceMovie(match):
            ref = match.group(1)
            item = titlesRefs.get(ref)
            if not item:
                return ref
            return movieTxt % {'movieID': item.movieID,
                               'title': _linkText(ref)}

        def _replacePerson(match):
            ref = match.group(1)
            item = namesRefs.get(ref)
            if not item:
                return ref
            return personTxt % {'personID': item.personID,
                                'name': _linkText(ref)}

        def _replaceCharacter(match):
            ref = match.group(1)
            if characterTxt is None:
                return ref
            item = characterRefs.get(ref)
            if not item:
                return ref
            characterID = item.characterID
            if characterID is None:
                return ref
            return characterTxt % {'characterID': characterID,
                                   'name': _linkText(ref)}

        # Angle brackets are escaped before any link is added.
        s = s.replace('<', '&lt;').replace('>', '&gt;')
        s = _re_hrefsub(r'<a href="\1">\1</a>', s)
        s = re_titleRef.sub(_replaceMovie, s)
        s = re_nameRef.sub(_replacePerson, s)
        s = re_characterRef.sub(_replaceCharacter, s)
        return s

    modCGILinks.movieTxt = movieTxt
    modCGILinks.personTxt = personTxt
    modCGILinks.characterTxt = characterTxt
    return modCGILinks
# Format strings for links to the imdb.com web site.
_movieTxt = '<a href="' + imdbURL_movie_base + 'tt%(movieID)s">%(title)s</a>'
_personTxt = '<a href="' + imdbURL_person_base + 'nm%(personID)s">%(name)s</a>'
_characterTxt = '<a href="' + imdbURL_character_base + \
'ch%(characterID)s">%(name)s</a>'
# Ready-made functions to turn references into HTML links, using
# the default (latin_1) and the ascii encoding.
modHtmlLinks = makeModCGILinks(movieTxt=_movieTxt, personTxt=_personTxt,
characterTxt=_characterTxt)
modHtmlLinksASCII = makeModCGILinks(movieTxt=_movieTxt, personTxt=_personTxt,
characterTxt=_characterTxt,
encoding='ascii')
# A copy of entcharrefs, extended with five more entities, each stored
# both by name and by numeric ('#<code point>') reference.
everyentcharrefs = entcharrefs.copy()
for k, v in {'lt':u'<','gt':u'>','amp':u'&','quot':u'"','apos':u'\''}.items():
everyentcharrefs[k] = v
everyentcharrefs['#%s' % ord(v)] = v
everyentcharrefsget = everyentcharrefs.get
# Matches '&<reference>;', where the reference is one of the keys above,
# '#160' or '#' followed by one to five digits.
re_everyentcharrefs = re.compile('&(%s|\#160|\#\d{1,5});' %
'|'.join(map(re.escape, everyentcharrefs)))
re_everyentcharrefssub = re_everyentcharrefs.sub
def _replAllXMLRef(match):
    """Return the replacement for the matched XML reference."""
    ref = match.group(1)
    value = everyentcharrefsget(ref)
    if value is not None:
        return value
    # Unknown references: numeric ones are converted to the character
    # with that code, the others are replaced by their own name.
    if ref[0] == '#':
        return unichr(int(ref[1:]))
    return ref
def subXMLHTMLSGMLRefs(s):
    """Replace the XML/HTML/SGML entity and char references found
    in the given string, returning the new string."""
    replaced = re_everyentcharrefssub(_replAllXMLRef, s)
    return replaced
def sortedSeasons(m):
    """Return the sorted list of the seasons of the given series
    (the keys of its 'episodes' dictionary)."""
    return sorted(m.get('episodes', {}).keys())
def sortedEpisodes(m, season=None):
    """Return the list of the episodes of the given series, sorted by
    episode number within each season.

    *season* can be a single season or a tuple/list of seasons, taken
    in the given order; every season is considered, if None."""
    if season is None:
        seasons = sortedSeasons(m)
    elif isinstance(season, (tuple, list)):
        seasons = season
    else:
        seasons = [season]
    episodes = []
    for s in seasons:
        # Seasons missing from the series contribute no episode.
        inSeason = m.get('episodes', {}).get(s, {})
        for e in sorted(inSeason.keys()):
            episodes.append(m['episodes'][s][e])
    return episodes
# Idea and portions of the code courtesy of none none (dclist at gmail.com)
# An imdbID inside a URL: a two-letter prefix followed by at least seven
# digits (IDs longer than seven digits exist, and must match as a whole).
_re_imdbIDurl = re.compile(r'\b(nm|tt|ch|co)([0-9]{7,})\b')

def get_byURL(url, info=None, args=None, kwds=None):
    """Return a Movie, Person, Character or Company object for the given URL;
    info is the info set to retrieve, args and kwds are respectively a list
    and a dictionary of arguments to initialize the data access system.

    Returns None if unable to correctly parse the url; can raise
    exceptions if unable to retrieve the data."""
    # Parse the URL first: there's no point in setting up a data access
    # system for a URL without an imdbID.
    match = _re_imdbIDurl.search(url)
    if not match:
        return None
    imdbtype = match.group(1)
    imdbID = match.group(2)
    if args is None:
        args = []
    if kwds is None:
        kwds = {}
    ia = IMDb(*args, **kwds)
    if imdbtype == 'tt':
        return ia.get_movie(imdbID, info=info)
    elif imdbtype == 'nm':
        return ia.get_person(imdbID, info=info)
    elif imdbtype == 'ch':
        return ia.get_character(imdbID, info=info)
    elif imdbtype == 'co':
        return ia.get_company(imdbID, info=info)
    return None
# Idea and portions of code courtesy of Basil Shubin.
# Beware that these information are now available directly by
# the Movie/Person/Character instances.
def fullSizeCoverURL(obj):
    """Return the URL of the full-size version of a cover/headshot,
    given a URL string or a Movie, Person or Character instance;
    None if no URL is available.

    This function is obsolete: the same information are available as
    keys: 'full-size cover url' and 'full-size headshot', respectively
    for movies and persons/characters."""
    key = None
    if isinstance(obj, Movie):
        key = 'cover url'
    elif isinstance(obj, (Person, Character)):
        key = 'headshot'
    # Anything else is assumed to be the URL itself.
    if key is None:
        coverUrl = obj
    else:
        coverUrl = obj.get(key)
    if not coverUrl:
        return None
    return _Container._re_fullsizeURL.sub('', coverUrl)
def keyToXML(key):
    """Convert a key (one of those used to access information in Movie
    and other classes instances) to the style of the XML output."""
    # Only the first item of the value returned by _tagAttr is needed.
    converted = _tagAttr(key, '')
    return converted[0]
def translateKey(key):
    """Return the translation of the given key, looked up by its
    XML-style name."""
    xmlKey = keyToXML(key)
    return _(xmlKey)
# Maps tags to classes.
_MAP_TOP_OBJ = {
'person': Person,
'movie': Movie,
'character': Character,
'company': Company
}
# Tags to be converted to lists: the first item of every value of
# TAGS_TO_MODIFY, plus the tags of the top-level objects above.
# It's used only for membership tests, so the values don't matter.
_TAGS_TO_LIST = dict([(x[0], None) for x in TAGS_TO_MODIFY.values()])
_TAGS_TO_LIST.update(_MAP_TOP_OBJ)
def tagToKey(tag):
    """Return the value of the 'key' attribute of the tag (as an
    integer, if its 'keytype' is 'int'); the name of the tag is
    returned when the attribute is missing or empty."""
    keyAttr = tag.get('key')
    if not keyAttr:
        return tag.name
    if tag.get('keytype') == 'int':
        return int(keyAttr)
    return keyAttr
def _valueWithType(tag, tagValue):
"""Return tagValue, handling some type conversions."""
tagType = tag.get('type')
if tagType == 'int':
tagValue = int(tagValue)
elif tagType == 'float':
tagValue = float(tagValue)
return tagValue
# Extra tags to get (if values were not already read from title/name).
_titleTags = ('imdbindex', 'kind', 'year')
_nameTags = ('imdbindex')
_companyTags = ('imdbindex', 'country')
def parseTags(tag, _topLevel=True, _as=None, _infoset2keys=None,
_key2infoset=None):
"""Recursively parse a tree of tags.

*tag* -- the tag to parse.
*_topLevel* -- True only for the outermost call.
*_as* -- access system to set on objects that don't declare their own.
*_infoset2keys*, *_key2infoset* -- dictionaries shared by the recursive
calls, filled with the 'infoset' attributes met.

Return the parsed object: an instance of one of the classes in
_MAP_TOP_OBJ, a string, an int, a float, a list or a dictionary."""
# The returned object (usually a _Container subclass, but it can
# be a string, an int, a float, a list or a dictionary).
item = None
if _infoset2keys is None:
_infoset2keys = {}
if _key2infoset is None:
_key2infoset = {}
name = tagToKey(tag)
firstChild = tag.find(recursive=False)
tagStr = (tag.string or u'').strip()
if not tagStr and name == 'item':
# Handles 'item' tags containing text and a 'notes' sub-tag.
tagContent = tag.contents[0]
if isinstance(tagContent, BeautifulSoup.NavigableString):
tagStr = (unicode(tagContent) or u'').strip()
# NOTE(review): tagType doesn't seem to be read again in this function.
tagType = tag.get('type')
# Record which info set the key belongs to, in both directions.
infoset = tag.get('infoset')
if infoset:
_key2infoset[name] = infoset
_infoset2keys.setdefault(infoset, []).append(name)
# Here we use tag.name to avoid tags like <item title="company">
if tag.name in _MAP_TOP_OBJ:
# One of the subclasses of _Container.
item = _MAP_TOP_OBJ[name]()
# The first access system met is inherited by the objects
# that don't specify one.
itemAs = tag.get('access-system')
if itemAs:
if not _as:
_as = itemAs
else:
itemAs = _as
item.accessSystem = itemAs
tagsToGet = []
theID = tag.get('id')
if name == 'movie':
item.movieID = theID
tagsToGet = _titleTags
# NOTE(review): theTitle doesn't seem to be used; tag.title is read instead.
theTitle = tag.find('title', recursive=False)
if tag.title:
item.set_title(tag.title.string)
tag.title.extract()
else:
if name == 'person':
item.personID = theID
tagsToGet = _nameTags
theName = tag.find('long imdb canonical name', recursive=False)
if not theName:
theName = tag.find('name', recursive=False)
elif name == 'character':
item.characterID = theID
tagsToGet = _nameTags
theName = tag.find('name', recursive=False)
elif name == 'company':
item.companyID = theID
tagsToGet = _companyTags
theName = tag.find('name', recursive=False)
if theName:
item.set_name(theName.string)
if theName:
theName.extract()
# Read the extra tags, unless their values are already known.
for t in tagsToGet:
if t in item.data:
continue
dataTag = tag.find(t, recursive=False)
if dataTag:
item.data[tagToKey(dataTag)] = _valueWithType(dataTag,
dataTag.string)
# Sub-tags handled here are removed from the tree (extract), so
# that they aren't met again by the loop at the end of the function.
if tag.notes:
item.notes = tag.notes.string
tag.notes.extract()
episodeOf = tag.find('episode-of', recursive=False)
if episodeOf:
item.data['episode of'] = parseTags(episodeOf, _topLevel=False,
_as=_as, _infoset2keys=_infoset2keys,
_key2infoset=_key2infoset)
episodeOf.extract()
cRole = tag.find('current-role', recursive=False)
if cRole:
cr = parseTags(cRole, _topLevel=False, _as=_as,
_infoset2keys=_infoset2keys, _key2infoset=_key2infoset)
item.currentRole = cr
cRole.extract()
# XXX: big assumption, here. What about Movie instances used
# as keys in dictionaries? What about other keys (season and
# episode number, for example?)
if not _topLevel:
#tag.extract()
return item
# _adder stores a parsed sub-item in the object being built; how it's
# stored depends on the kind of that object.
_adder = lambda key, value: item.data.update({key: value})
elif tagStr:
# A tag with text: the notes, if any, are appended after '::'.
if tag.notes:
notes = (tag.notes.string or u'').strip()
if notes:
tagStr += u'::%s' % notes
else:
tagStr = _valueWithType(tag, tagStr)
return tagStr
elif firstChild:
firstChildName = tagToKey(firstChild)
if firstChildName in _TAGS_TO_LIST:
item = []
_adder = lambda key, value: item.append(value)
else:
item = {}
_adder = lambda key, value: item.update({key: value})
else:
item = {}
_adder = lambda key, value: item.update({name: value})
for subTag in tag(recursive=False):
subTagKey = tagToKey(subTag)
# Exclude dynamically generated keys.
if tag.name in _MAP_TOP_OBJ and subTagKey in item._additional_keys():
continue
subItem = parseTags(subTag, _topLevel=False, _as=_as,
_infoset2keys=_infoset2keys, _key2infoset=_key2infoset)
if subItem:
_adder(subTagKey, subItem)
if _topLevel and name in _MAP_TOP_OBJ:
# Add information about 'info sets', but only to the top-level object.
item.infoset2keys = _infoset2keys
item.key2infoset = _key2infoset
item.current_info = _infoset2keys.keys()
return item
def parseXML(xml):
    """Parse a XML string, returning an appropriate object (usually an
    instance of a subclass of _Container); None is returned if no
    tag is found."""
    entities = BeautifulSoup.BeautifulStoneSoup.XHTML_ENTITIES
    xmlObj = BeautifulSoup.BeautifulStoneSoup(xml, convertEntities=entities)
    if not xmlObj:
        return None
    # The first tag found is the root of the tree to parse.
    mainTag = xmlObj.find()
    if not mainTag:
        return None
    return parseTags(mainTag)
-29
View File
@@ -1,29 +0,0 @@
"""
locale package (imdb package).
This package provides scripts and files for internationalization
of IMDbPY.
Copyright 2009 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import gettext
import os
# Directory that contains this package; used as the gettext locale directory.
LOCALE_DIR = os.path.dirname(__file__)
# Bind the 'imdbpy' message domain to that directory.
gettext.bindtextdomain('imdbpy', LOCALE_DIR)
-78
View File
@@ -1,78 +0,0 @@
#!/usr/bin/env python
"""
generatepot.py script.
This script generates the imdbpy.pot file, from the DTD.
Copyright 2009 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import re
import sys
from datetime import datetime as dt
# Optional overrides for the "# Default:" comment of an element, keyed by
# element name; elements not listed here get a text derived from their name.
DEFAULT_MESSAGES = { }
# Matches a DTD element declaration and captures the element name.
ELEMENT_PATTERN = r"""<!ELEMENT\s+([^\s]+)"""
re_element = re.compile(ELEMENT_PATTERN)
POT_HEADER_TEMPLATE = r"""# Gettext message file for imdbpy
msgid ""
msgstr ""
"Project-Id-Version: imdbpy\n"
"POT-Creation-Date: %(now)s\n"
"PO-Revision-Date: YYYY-MM-DD HH:MM+0000\n"
"Last-Translator: YOUR NAME <YOUR@EMAIL>\n"
"Language-Team: TEAM NAME <TEAM@EMAIL>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Plural-Forms: nplurals=1; plural=0;\n"
"Language-Code: en\n"
"Language-Name: English\n"
"Preferred-Encodings: utf-8\n"
"Domain: imdbpy\n"
"""
# Script body: read the DTD named on the command line and print a .pot
# catalog (header plus one entry per distinct element name) on stdout.
if len(sys.argv) != 2:
    # stdout carries the generated catalog, so the usage message goes to
    # stderr; a usage error is a failure, hence the non-zero exit status.
    sys.stderr.write("Usage: %s dtd_file\n" % sys.argv[0])
    sys.exit(1)

dtdfilename = sys.argv[1]
# Read the whole DTD, making sure the file handle is released.
dtd_file = open(dtdfilename)
try:
    dtd = dtd_file.read()
finally:
    dtd_file.close()

# Collect every distinct element name declared in the DTD.
elements = re_element.findall(dtd)
uniq = set(elements)
elements = list(uniq)

print(POT_HEADER_TEMPLATE % {
    'now': dt.now().strftime("%Y-%m-%d %H:%M+0000")
})

for element in sorted(elements):
    if element in DEFAULT_MESSAGES:
        print('# Default: %s' % DEFAULT_MESSAGES[element])
    else:
        print('# Default: %s' % element.replace('-', ' ').capitalize())
    print('msgid "%s"' % element)
    print('msgstr ""')
    # use this part instead of the line above to generate the po file for English
    #if element in DEFAULT_MESSAGES:
    #    print('msgstr "%s"' % DEFAULT_MESSAGES[element])
    #else:
    #    print('msgstr "%s"' % element.replace('-', ' ').capitalize())
    # Blank line separating two catalog entries.
    print('')
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
-204
View File
@@ -1,204 +0,0 @@
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
"""Generate binary message catalog from textual translation description.
This program converts a textual Uniforum-style message catalog (.po file) into
a binary GNU catalog (.mo file). This is essentially the same function as the
GNU msgfmt program, however, it is a simpler implementation.
Usage: msgfmt.py [OPTIONS] filename.po
Options:
-o file
--output-file=file
Specify the output file to write to. If omitted, output will go to a
file named filename.mo (based off the input file name).
-h
--help
Print this message and exit.
-V
--version
Display version information and exit.
Written by Martin v. Löwis <loewis@informatik.hu-berlin.de>,
refactored / fixed by Thomas Waldmann <tw AT waldmann-edv DOT de>.
"""
import sys, os
import getopt, struct, array
# Version string reported by the -V / --version command-line option.
__version__ = "1.3"
class SyntaxErrorException(Exception):
    """Signals that the content of a .po file could not be parsed."""
class MsgFmt(object):
    """Transform a textual .po catalog into the binary .mo format."""

    def __init__(self):
        # Maps msgid -> msgstr for every entry accepted by add().
        self.messages = {}

    def make_filenames(self, filename, outfile=None):
        """Compute the input (.po) and output (.mo) file names.

        filename may be a .po file name or a bare name/language, in which
        case '.po' is appended.  When outfile is None it is derived from
        the input name by swapping the extension for '.mo'.

        Returns the tuple (infile, outfile).
        """
        if filename.endswith('.po'):
            infile = filename
        else:
            infile = filename + '.po'
        if outfile is None:
            outfile = os.path.splitext(infile)[0] + '.mo'
        return infile, outfile

    def add(self, id, str, fuzzy):
        """Add a non-fuzzy translation to the dictionary.

        Entries flagged as fuzzy, or with an empty translation, are
        silently ignored.
        """
        if not fuzzy and str:
            self.messages[id] = str

    def read_po(self, lines):
        """Parse the lines of a .po file, filling self.messages.

        lines is an iterable of text lines.  Raises SyntaxErrorException
        when a line is not a valid string literal or when a string appears
        outside of a msgid/msgstr section.
        """
        ID = 1
        STR = 2

        section = None
        fuzzy = False
        line_no = 0
        msgid = msgstr = ''

        # Parse the catalog
        for line in lines:
            line_no += 1
            # If we get a comment line after a msgstr, this is a new entry
            if line.startswith('#') and section == STR:
                self.add(msgid, msgstr, fuzzy)
                section = None
                fuzzy = False
            # Record a fuzzy mark
            if line.startswith('#,') and 'fuzzy' in line:
                fuzzy = True
            # Skip comments
            if line.startswith('#'):
                continue
            # Now we are in a msgid section, output previous section
            if line.startswith('msgid'):
                if section == STR:
                    self.add(msgid, msgstr, fuzzy)
                    fuzzy = False
                section = ID
                line = line[5:]
                msgid = msgstr = ''
            # Now we are in a msgstr section
            elif line.startswith('msgstr'):
                section = STR
                line = line[6:]
            # Skip empty lines
            line = line.strip()
            if not line:
                continue
            # XXX: Does this always follow Python escape semantics?
            # NOTE(review): eval() executes whatever expression the .po file
            # contains; only feed this parser catalogs from a trusted source.
            try:
                line = eval(line)
            except Exception:
                # A malformed literal is a catalog syntax error; report it
                # as such (with the line number) instead of leaking an
                # arbitrary NameError/SyntaxError to the caller.
                raise SyntaxErrorException('Syntax error on line %d, before:\n%s' % (line_no, line))
            if section == ID:
                msgid += line
            elif section == STR:
                msgstr += line
            else:
                raise SyntaxErrorException('Syntax error on line %d, before:\n%s' % (line_no, line))
        # Add last entry
        if section == STR:
            self.add(msgid, msgstr, fuzzy)

    def generate_mo(self):
        """Return the generated output.

        The result is the content of a GNU .mo file built from
        self.messages: header, key index, value index, NUL-terminated
        keys, NUL-terminated values.
        """
        # the keys are sorted in the .mo file
        keys = list(self.messages.keys())
        keys.sort()
        offsets = []
        ids = ''
        strs = ''
        for id in keys:
            # For each string, we need size and file offset.  Each string is NUL
            # terminated; the NUL does not count into the size.
            offsets.append((len(ids), len(id), len(strs), len(self.messages[id])))
            ids += id + '\0'
            strs += self.messages[id] + '\0'
        output = []
        # The header is 7 32-bit unsigned integers.  We don't use hash tables, so
        # the keys start right after the index tables.
        keystart = 7*4 + 16*len(keys)
        # and the values start after the keys
        valuestart = keystart + len(ids)
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets += [l1, o1 + keystart]
            voffsets += [l2, o2 + valuestart]
        offsets = koffsets + voffsets
        output.append(struct.pack("Iiiiiii",
                                  0x950412de,         # Magic
                                  0,                  # Version
                                  len(keys),          # # of entries
                                  7*4,                # start of key index
                                  7*4 + len(keys)*8,  # start of value index
                                  0, 0))              # size and offset of hash table
        output.append(array.array("i", offsets).tostring())
        output.append(ids)
        output.append(strs)
        return ''.join(output)
def make(filename, outfile):
    """Compile the .po file named by filename into a .mo file.

    filename and outfile are resolved through MsgFmt.make_filenames.
    Errors are reported on stderr: a failure to read the input or to
    parse it terminates the program with exit status 1; a failure to
    write the output is only reported.
    """
    mf = MsgFmt()
    infile, outfile = mf.make_filenames(filename, outfile)
    try:
        po_file = open(infile)
        try:
            lines = po_file.readlines()
        finally:
            po_file.close()
    except IOError:
        sys.stderr.write('%s\n' % (sys.exc_info()[1],))
        sys.exit(1)
    try:
        mf.read_po(lines)
        output = mf.generate_mo()
    except SyntaxErrorException:
        sys.stderr.write('%s\n' % (sys.exc_info()[1],))
        # Without parsed content there is nothing to write; previously the
        # code fell through and crashed on the undefined 'output'.
        sys.exit(1)
    try:
        mo_file = open(outfile, "wb")
        try:
            mo_file.write(output)
        finally:
            mo_file.close()
    except IOError:
        sys.stderr.write('%s\n' % (sys.exc_info()[1],))
def usage(code, msg=''):
    """Write the module docstring (and msg, when given) to stderr, then exit.

    code is used as the process exit status.
    """
    err = sys.stderr
    err.write('%s\n' % (__doc__,))
    if msg:
        err.write('%s\n' % (msg,))
    sys.exit(code)
def main():
    """Command-line entry point: parse the options and compile each .po file."""
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hVo:',
                                   ['help', 'version', 'output-file='])
    except getopt.error:
        usage(1, sys.exc_info()[1])

    outfile = None
    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-V', '--version'):
            sys.stderr.write('%s %s\n' % ("msgfmt.py", __version__))
            sys.exit(0)
        elif opt in ('-o', '--output-file'):
            outfile = arg

    # do it
    if args:
        for filename in args:
            make(filename, outfile)
        return
    sys.stderr.write('No input file given\n')
    sys.stderr.write("Try `msgfmt --help' for more information.\n")


if __name__ == '__main__':
    main()
-49
View File
@@ -1,49 +0,0 @@
#!/usr/bin/env python
"""
rebuildmo.py script.
This script builds the .mo files, from the .po files.
Copyright 2009 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import glob
import msgfmt
import os
#LOCALE_DIR = os.path.dirname(__file__)
def rebuildmo():
    """Compile every imdbpy-*.po file of the current directory to a .mo file.

    For each 'imdbpy-<lang>.po' the output is written to
    '<lang>/LC_MESSAGES/imdbpy.mo' (directories are created as needed).
    Returns the list of processed languages.
    """
    prefix, suffix = 'imdbpy-', '.po'
    built = []
    for po_path in glob.glob(prefix + '*' + suffix):
        language = po_path[len(prefix):-len(suffix)]
        target_dir = os.path.join(language, 'LC_MESSAGES')
        for directory in (language, target_dir):
            if not os.path.exists(directory):
                os.mkdir(directory)
        msgfmt.make(po_path, os.path.join(target_dir, 'imdbpy.mo'))
        built.append(language)
    return built
if __name__ == '__main__':
    # Executed as a script: rebuild the catalogs and report the languages.
    languages = rebuildmo()
    print('Created locale for: %s.' % ' '.join(languages))
-28
View File
@@ -1,28 +0,0 @@
"""
parser package (imdb package).
This package provides various parsers to access IMDb data (e.g.: a
parser for the web/http interface, a parser for the SQL database
interface, etc.).
So far, the http/httpThin, mobile and sql parsers are implemented.
Copyright 2004-2009 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
# Sub-packages exposed by 'from imdb.parser import *'.
__all__ = ['http', 'mobile', 'sql']
-775
View File
@@ -1,775 +0,0 @@
"""
parser.http package (imdb package).
This package provides the IMDbHTTPAccessSystem class used to access
IMDb's data through the web interface.
the imdb.IMDb function will return an instance of this class when
called with the 'accessSystem' argument set to "http" or "web"
or "html" (this is the default).
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import sys
import logging
from urllib import FancyURLopener, quote_plus
from codecs import lookup
from imdb import IMDbBase, imdbURL_movie_main, imdbURL_person_main, \
imdbURL_character_main, imdbURL_company_main, \
imdbURL_keyword_main, imdbURL_find, imdbURL_top250, \
imdbURL_bottom100
from imdb.utils import analyze_title
from imdb._exceptions import IMDbDataAccessError, IMDbParserError
import searchMovieParser
import searchPersonParser
import searchCharacterParser
import searchCompanyParser
import searchKeywordParser
import movieParser
import personParser
import characterParser
import companyParser
import topBottomParser
# Logger for miscellaneous functions.
_aux_logger = logging.getLogger('imdbpy.parser.http.aux')

# IN_GAE is True only when the google.appengine package can be imported.
try:
    import google.appengine
except ImportError:
    IN_GAE = False
else:
    IN_GAE = True
    _aux_logger.info('IMDbPY is running in the Google App Engine environment')
class _ModuleProxy:
    """Proxy that lazily instantiates, and then caches, a module's parsers."""

    def __init__(self, module, defaultKeys=None, oldParsers=False,
                 useModule=None, fallBackToNew=False):
        """Create a proxy for the given module.

        defaultKeys, if set, must be a dictionary of attributes to set on
        every instantiated parser.  oldParsers and fallBackToNew only
        trigger a warning; useModule is forwarded to the parsers.
        """
        if oldParsers or fallBackToNew:
            _aux_logger.warn('The old set of parsers was removed; falling ' \
                             'back to the new parsers.')
        self.useModule = useModule
        self._defaultKeys = {}
        if defaultKeys is not None:
            self._defaultKeys = defaultKeys
        self._module = module

    def __getattr__(self, name):
        """Called only when the normal attribute look-up fails.

        Names listed in the module's _OBJECTS dictionary are built as
        parser instances and stored on the proxy (so this method is not
        called again for them); any other name is read from the module.
        """
        module = self._module
        if name not in module._OBJECTS:
            return getattr(module, name)
        entry = module._OBJECTS[name]
        # Build the requested parser.
        init_kwds = {}
        if self.useModule:
            init_kwds = {'useModule': self.useModule}
        parser = entry[0][0](**init_kwds)
        # Defaults first, then the parser-specific attributes on top.
        attributes = self._defaultKeys.copy()
        attributes.update(entry[1] or {})
        for key, value in attributes.items():
            setattr(parser, key, value)
        setattr(self, name, parser)
        return parser
# (major, minor) version of the running Python interpreter.
PY_VERSION = sys.version_info[:2]
# The cookies for the "adult" search.
# Please don't mess with these accounts.
# Old 'IMDbPY' account.
_old_cookie_id = 'boM2bYxz9MCsOnH9gZ0S9QHs12NWrNdApxsls1Vb5/NGrNdjcHx3dUas10UASoAjVEvhAbGagERgOpNkAPvxdbfKwaV2ikEj9SzXY1WPxABmDKQwdqzwRbM+12NSeJFGUEx3F8as10WwidLzVshDtxaPIbP13NdjVS9UZTYqgTVGrNcT9vyXU1'
_old_cookie_uu = '3M3AXsquTU5Gur/Svik+ewflPm5Rk2ieY3BIPlLjyK3C0Dp9F8UoPgbTyKiGtZp4x1X+uAUGKD7BM2g+dVd8eqEzDErCoYvdcvGLvVLAen1y08hNQtALjVKAe+1hM8g9QbNonlG1/t4S82ieUsBbrSIQbq1yhV6tZ6ArvSbA7rgHc8n5AdReyAmDaJ5Wm/ee3VDoCnGj/LlBs2ieUZNorhHDKK5Q=='
# New 'IMDbPYweb' account.
_cookie_id = 'rH1jNAkjTlNXvHolvBVBsgaPICNZbNdjVjzFwzas9JRmusdjVoqBs/Hs12NR+1WFxEoR9bGKEDUg6sNlADqXwkas12N131Rwdb+UQNGKN8PWrNdjcdqBQVLq8mbGDHP3hqzxhbD692NQi9D0JjpBtRaPIbP1zNdjUOqENQYv1ADWrNcT9vyXU1'
_cookie_uu = 'su4/m8cho4c6HP+W1qgq6wchOmhnF0w+lIWvHjRUPJ6nRA9sccEafjGADJ6hQGrMd4GKqLcz2X4z5+w+M4OIKnRn7FpENH7dxDQu3bQEHyx0ZEyeRFTPHfQEX03XF+yeN1dsPpcXaqjUZAw+lGRfXRQEfz3RIX9IgVEffdBAHw2wQXyf9xdMPrQELw0QNB8dsffsqcdQemjPB0w+moLcPh0JrKrHJ9hjBzdMPpcXTH7XRwwOk='
# imdbpy2010 account.
#_cookie_id = 'QrCdxVi+L+WgqOLrQJJgBgRRXGInphxiBPU/YXSFDyExMFzCp6YcYgSVXyEUhS/xMID8wqemHGID4DlntwZ49vemP5UXsAxiJ4D6goSmHGIgNT9hMXBaRSF2vMS3phxB0bVfQiQlP1RxdrzhB6YcRHFASyIhQVowwXCKtDSlD2YhgRvxBsCKtGemHBKH9mxSI='
#_cookie_uu = 'oiEo2yoJFCA2Zbn/o7Z1LAPIwotAu6QdALv3foDb1x5F/tdrFY63XkSfty4kntS8Y8jkHSDLt3406+d+JThEilPI0mtTaOQdA/t2/iErp22jaLdeVU5ya4PIREpj7HFdpzhEHadcIAngSER50IoHDpD6Bz4Qy3b+UIhE/hBbhz5Q63ceA2hEvhPo5B0FnrL9Q8jkWjDIbA0Au3d+AOtnXoCIRL4Q28c+UOtnXpP4RL4T6OQdA+6ijUCI5B0AW2d+UOtnXpPYRL4T6OQdA8jkTUOYlC0A=='
class _FakeURLOpener(object):
    """Stand-in for an opened URL, used to return empty strings instead
    of errors.
    """

    def __init__(self, url, headers):
        self.url = url
        self.headers = headers

    def read(self, *args, **kwds):
        """Always return an empty string, whatever the arguments."""
        return ''

    def close(self, *args, **kwds):
        """Do nothing."""
        return None

    def info(self, *args, **kwds):
        """Return the headers given at construction time."""
        return self.headers
class IMDbURLopener(FancyURLopener):
    """URL opener that fetches web pages and reports failures as
    IMDbDataAccessError."""
    _logger = logging.getLogger('imdbpy.parser.http.urlopener')

    def __init__(self, *args, **kwargs):
        """Set up the opener with a Mozilla User-Agent and the default cookie."""
        self._last_url = u''
        FancyURLopener.__init__(self, *args, **kwargs)
        # Headers to add to every request.
        # XXX: IMDb's web server doesn't like urllib-based programs,
        #      so drop every spelling of the header and pose as Mozilla.
        for spelling in ('User-Agent', 'User-agent', 'user-agent'):
            self.del_header(spelling)
        self.set_header('User-Agent', 'Mozilla/5.0')
        # XXX: This class is used also to perform "Exact Primary
        #      [Title|Name]" searches, and so by default the cookie is set.
        self.set_header('Cookie', 'id=%s; uu=%s' % (_cookie_id, _cookie_uu))

    def get_proxy(self):
        """Return the used proxy, or an empty string."""
        return self.proxies.get('http', '')

    def set_proxy(self, proxy):
        """Set the proxy; a false value removes the current one."""
        if proxy:
            if not proxy.lower().startswith('http://'):
                proxy = 'http://%s' % proxy
            self.proxies['http'] = proxy
        elif 'http' in self.proxies:
            del self.proxies['http']

    def set_header(self, header, value, _overwrite=True):
        """Set a default header; unless _overwrite is false, an existing
        header with the same name is removed first."""
        if _overwrite:
            self.del_header(header)
        self.addheaders.append((header, value))

    def del_header(self, header):
        """Remove a default header (the first one with that exact name)."""
        for position, entry in enumerate(self.addheaders):
            if entry[0] == header:
                del self.addheaders[position]
                break

    def retrieve_unicode(self, url, size=-1):
        """Retrieve the given URL and return its content as a unicode string.

        The encoding is taken from the charset announced by the server or,
        failing that, from a 'text/html; charset=' declaration within the
        first 512 characters; latin_1 is assumed when neither yields a
        known codec.  When size is not -1 a Range header limits the
        request.  An IOError is converted into IMDbDataAccessError.
        """
        ranged = size != -1
        encode = None
        try:
            if ranged:
                self.set_header('Range', 'bytes=0-%d' % size)
            uopener = self.open(url)
            read_kwds = {}
            if PY_VERSION > (2, 3) and not IN_GAE:
                read_kwds['size'] = size
            content = uopener.read(**read_kwds)
            self._last_url = uopener.url
            # Maybe the server is so nice to tell us the charset...
            server_encode = uopener.info().getparam('charset')
            # Otherwise, look at the content-type HTML meta tag.
            if server_encode is None and content:
                marker = 'text/html; charset='
                head = content[:512]
                marker_at = head.find(marker)
                if marker_at != -1:
                    value_at = marker_at + len(marker)
                    quote_at = head[value_at:].find('"')
                    if quote_at != -1:
                        server_encode = head[value_at:value_at + quote_at]
            if server_encode:
                # Trust the announced charset only if Python knows the codec.
                try:
                    if lookup(server_encode):
                        encode = server_encode
                except (LookupError, ValueError, TypeError):
                    pass
            uopener.close()
            if ranged:
                self.del_header('Range')
            self.close()
        except IOError:
            e = sys.exc_info()[1]
            if ranged:
                # Ensure that the Range header is removed.
                self.del_header('Range')
            raise IMDbDataAccessError({'errcode': e.errno,
                                       'errmsg': str(e.strerror),
                                       'url': url,
                                       'proxy': self.get_proxy(),
                                       'exception type': 'IOError',
                                       'original exception': e})
        if encode is None:
            encode = 'latin_1'
            # The detection of the encoding is error prone...
            self._logger.warn('Unable to detect the encoding of the retrieved '
                              'page [%s]; falling back to default latin1.', encode)
        return unicode(content, encode, 'replace')

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Return an empty fake response for a 404 (after logging it);
        raise IMDbDataAccessError for any other HTTP error."""
        if errcode != 404:
            raise IMDbDataAccessError({'url': 'http:%s' % url,
                                       'errcode': errcode,
                                       'errmsg': errmsg,
                                       'headers': headers,
                                       'error type': 'http_error_default',
                                       'proxy': self.get_proxy()})
        self._logger.warn('404 code returned for %s: %s (headers: %s)',
                          url, errmsg, headers)
        return _FakeURLOpener(url, headers)

    def open_unknown(self, fullurl, data=None):
        """Always raise IMDbDataAccessError with the details of the request."""
        raise IMDbDataAccessError({'fullurl': fullurl,
                                   'data': str(data),
                                   'error type': 'open_unknown',
                                   'proxy': self.get_proxy()})

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Always raise IMDbDataAccessError with the details of the request."""
        raise IMDbDataAccessError({'proxy': str(proxy),
                                   'fullurl': fullurl,
                                   'error type': 'open_unknown_proxy',
                                   'data': str(data)})
class IMDbHTTPAccessSystem(IMDbBase):
"""The class used to access IMDb's data through the web."""
accessSystem = 'http'
_http_logger = logging.getLogger('imdbpy.parser.http')
def __init__(self, isThin=0, adultSearch=1, proxy=-1, oldParsers=False,
             fallBackToNew=False, useModule=None, cookie_id=-1,
             cookie_uu=None, *arguments, **keywords):
    """Initialize the access system.

    Sets up the URL opener, the thin-mode flags, cookies and proxy, and
    one _ModuleProxy per parser module.  For cookie_id and proxy the
    value -1 means "leave the default untouched".
    """
    IMDbBase.__init__(self, *arguments, **keywords)
    self.urlOpener = IMDbURLopener()
    # When isThin is set, we're parsing the "maindetails" page
    # of a movie (instead of the "combined" page) and movie/person
    # references are not collected if no defaultModFunct is provided.
    self.isThin = isThin
    self._getRefs = True
    self._mdparse = False
    if isThin:
        if self.accessSystem == 'http':
            self.accessSystem = 'httpThin'
        self._mdparse = True
        if self._defModFunct is None:
            self._getRefs = False
            from imdb.utils import modNull
            self._defModFunct = modNull
    self.do_adult_search(adultSearch)
    if cookie_id != -1:
        if cookie_id is None:
            self.del_cookies()
        elif cookie_uu is not None:
            self.set_cookies(cookie_id, cookie_uu)
    if proxy != -1:
        self.set_proxy(proxy)
    if useModule is not None:
        # A comma-separated string becomes a list of module names.
        if not isinstance(useModule, (list, tuple)) and ',' in useModule:
            useModule = useModule.split(',')
    _def = {'_modFunct': self._defModFunct, '_as': self.accessSystem}
    # Proxy objects, one per parser module, all sharing the same settings.
    proxied_modules = (
        ('smProxy', searchMovieParser),
        ('spProxy', searchPersonParser),
        ('scProxy', searchCharacterParser),
        ('scompProxy', searchCompanyParser),
        ('skProxy', searchKeywordParser),
        ('mProxy', movieParser),
        ('pProxy', personParser),
        ('cProxy', characterParser),
        ('compProxy', companyParser),
        ('topBottomProxy', topBottomParser),
    )
    for attribute, parser_module in proxied_modules:
        setattr(self, attribute,
                _ModuleProxy(parser_module, defaultKeys=_def,
                             oldParsers=oldParsers, useModule=useModule,
                             fallBackToNew=fallBackToNew))
def _normalize_movieID(self, movieID):
    """Return the movieID as a decimal string zero-padded to seven digits.

    Raises IMDbParserError when the ID cannot be converted to an integer.
    """
    try:
        number = int(movieID)
    except ValueError:
        e = sys.exc_info()[1]
        raise IMDbParserError('invalid movieID "%s": %s' % (movieID, e))
    return '%07d' % number
def _normalize_personID(self, personID):
    """Return the personID as a decimal string zero-padded to seven digits.

    Raises IMDbParserError when the ID cannot be converted to an integer.
    """
    try:
        number = int(personID)
    except ValueError:
        e = sys.exc_info()[1]
        raise IMDbParserError('invalid personID "%s": %s' % (personID, e))
    return '%07d' % number
def _normalize_characterID(self, characterID):
    """Return the characterID as a decimal string zero-padded to seven digits.

    Raises IMDbParserError when the ID cannot be converted to an integer.
    """
    try:
        number = int(characterID)
    except ValueError:
        e = sys.exc_info()[1]
        raise IMDbParserError('invalid characterID "%s": %s' %
                              (characterID, e))
    return '%07d' % number
def _normalize_companyID(self, companyID):
    """Return the companyID as a decimal string zero-padded to seven digits.

    Raises IMDbParserError when the ID cannot be converted to an integer.
    """
    try:
        number = int(companyID)
    except ValueError:
        e = sys.exc_info()[1]
        raise IMDbParserError('invalid companyID "%s": %s' %
                              (companyID, e))
    return '%07d' % number
def get_imdbMovieID(self, movieID):
    """Return the imdbID for a movieID.

    With this access system the two coincide, so the argument is
    returned unchanged.
    """
    return movieID
def get_imdbPersonID(self, personID):
    """Return the imdbID for a personID.

    With this access system the two coincide, so the argument is
    returned unchanged.
    """
    return personID
def get_imdbCharacterID(self, characterID):
    """Return the imdbID for a characterID.

    With this access system the two coincide, so the argument is
    returned unchanged.
    """
    return characterID
def get_imdbCompanyID(self, companyID):
    """Return the imdbID for a companyID.

    With this access system the two coincide, so the argument is
    returned unchanged.
    """
    return companyID
def get_proxy(self):
    """Return the proxy used by the URL opener, or an empty string."""
    opener = self.urlOpener
    return opener.get_proxy()
def set_proxy(self, proxy):
    """Set the web proxy to use.

    It should be a string like 'http://localhost:8080/'; if the
    string is empty, no proxy will be used.
    If set, the value of the environment variable HTTP_PROXY is
    automatically used.
    """
    opener = self.urlOpener
    opener.set_proxy(proxy)
def set_cookies(self, cookie_id, cookie_uu):
    """Set the Cookie header used to access an IMDb account."""
    cookie_value = 'id=%s; uu=%s' % (cookie_id, cookie_uu)
    self.urlOpener.set_header('Cookie', cookie_value)
def del_cookies(self):
    """Remove the Cookie header from the URL opener."""
    opener = self.urlOpener
    opener.del_header('Cookie')
def do_adult_search(self, doAdult,
                    cookie_id=_cookie_id, cookie_uu=_cookie_uu):
    """If doAdult is true, 'adult' movies are included in the
    search results; cookie_id and cookie_uu are optional
    parameters to select a specific account (see your cookie
    or cookies.txt file).  If doAdult is false the Cookie header
    is removed."""
    if not doAdult:
        self.urlOpener.del_header('Cookie')
        return
    self.set_cookies(cookie_id, cookie_uu)
def _retrieve(self, url, size=-1):
    """Retrieve the given URL through the URL opener, logging the request."""
    self._http_logger.debug('fetching url %s (size: %d)', url, size)
    opener = self.urlOpener
    return opener.retrieve_unicode(url, size=size)
def _get_search_content(self, kind, ton, results):
    """Retrieve the web page for a given search.

    kind can be 'tt' (for titles), 'nm' (for names),
    'char' (for characters) or 'co' (for companies); 'ep' is turned
    into a title search restricted to episodes.
    ton is the title or the name to search.
    results is the maximum number of results to be retrieved."""
    if isinstance(ton, unicode):
        ton = ton.encode('utf-8')
    quoted = quote_plus(ton)
    params = 's=%s;mx=%s;q=%s' % (kind, str(results), quoted)
    if kind == 'ep':
        params = params.replace('s=ep;', 's=tt;ttype=ep;', 1)
    cont = self._retrieve(imdbURL_find % params)
    too_many_matches = ('Your search returned more than' in cont and
                        "displayed the exact matches" in cont)
    if not too_many_matches:
        return cont
    # The retrieved page contains no results, because too many
    # titles or names contain the string we're looking for.
    params = 's=%s;q=%s;lm=0' % (kind, quoted)
    return self._retrieve(imdbURL_find % params,
                          size=22528 + results * 512)
def _search_movie(self, title, results):
    """Search for a title ('tt' search) and return the parsed 'data'."""
    page = self._get_search_content('tt', title, results)
    parsed = self.smProxy.search_movie_parser.parse(page, results=results)
    return parsed['data']
def _search_episode(self, title, results):
    """Search for an episode ('ep' search) and return the parsed 'data'.

    When analyze_title recognizes the title as an episode, only its
    'title' part is searched for.
    """
    analyzed = analyze_title(title)
    if analyzed['kind'] == 'episode':
        title = analyzed['title']
    page = self._get_search_content('ep', title, results)
    parsed = self.smProxy.search_movie_parser.parse(page, results=results)
    return parsed['data']
def get_movie_main(self, movieID):
    """Fetch and parse the main page of a movie: 'maindetails' in thin
    mode, 'combined' otherwise."""
    page_name = 'combined'
    if self.isThin:
        page_name = 'maindetails'
    cont = self._retrieve(imdbURL_movie_main % movieID + page_name)
    return self.mProxy.movie_parser.parse(cont, mdparse=self._mdparse)
def get_movie_full_credits(self, movieID):
    """Fetch the 'fullcredits' page of a movie and parse it with movie_parser."""
    url = imdbURL_movie_main % movieID + 'fullcredits'
    page = self._retrieve(url)
    return self.mProxy.movie_parser.parse(page)
def get_movie_plot(self, movieID):
    """Fetch the 'plotsummary' page of a movie and parse it with plot_parser."""
    url = imdbURL_movie_main % movieID + 'plotsummary'
    page = self._retrieve(url)
    return self.mProxy.plot_parser.parse(page, getRefs=self._getRefs)
def get_movie_awards(self, movieID):
    """Fetch the 'awards' page of a movie and parse it with movie_awards_parser."""
    url = imdbURL_movie_main % movieID + 'awards'
    page = self._retrieve(url)
    return self.mProxy.movie_awards_parser.parse(page)
def get_movie_taglines(self, movieID):
    """Fetch the 'taglines' page of a movie and parse it with taglines_parser."""
    url = imdbURL_movie_main % movieID + 'taglines'
    page = self._retrieve(url)
    return self.mProxy.taglines_parser.parse(page)
def get_movie_keywords(self, movieID):
    """Fetch the 'keywords' page of a movie and parse it with keywords_parser."""
    url = imdbURL_movie_main % movieID + 'keywords'
    page = self._retrieve(url)
    return self.mProxy.keywords_parser.parse(page)
def get_movie_alternate_versions(self, movieID):
    """Fetch the 'alternateversions' page of a movie and parse it with
    alternateversions_parser."""
    url = imdbURL_movie_main % movieID + 'alternateversions'
    page = self._retrieve(url)
    parser = self.mProxy.alternateversions_parser
    return parser.parse(page, getRefs=self._getRefs)
def get_movie_crazy_credits(self, movieID):
    """Fetch the 'crazycredits' page of a movie and parse it with
    crazycredits_parser."""
    url = imdbURL_movie_main % movieID + 'crazycredits'
    page = self._retrieve(url)
    parser = self.mProxy.crazycredits_parser
    return parser.parse(page, getRefs=self._getRefs)
def get_movie_goofs(self, movieID):
    """Fetch the 'goofs' page of a movie and parse it with goofs_parser."""
    url = imdbURL_movie_main % movieID + 'goofs'
    page = self._retrieve(url)
    return self.mProxy.goofs_parser.parse(page, getRefs=self._getRefs)
def get_movie_quotes(self, movieID):
    """Fetch the 'quotes' page of a movie and parse it with quotes_parser."""
    url = imdbURL_movie_main % movieID + 'quotes'
    page = self._retrieve(url)
    return self.mProxy.quotes_parser.parse(page, getRefs=self._getRefs)
def get_movie_release_dates(self, movieID):
    """Fetch the 'releaseinfo' page of a movie and parse it with
    releasedates_parser; the result is tagged as providing both the
    'release dates' and 'akas' info sets."""
    url = imdbURL_movie_main % movieID + 'releaseinfo'
    page = self._retrieve(url)
    parsed = self.mProxy.releasedates_parser.parse(page)
    parsed['info sets'] = ('release dates', 'akas')
    return parsed
get_movie_akas = get_movie_release_dates
def get_movie_vote_details(self, movieID):
    """Fetch the 'ratings' page of a movie and parse it with ratings_parser."""
    url = imdbURL_movie_main % movieID + 'ratings'
    page = self._retrieve(url)
    return self.mProxy.ratings_parser.parse(page)
def get_movie_official_sites(self, movieID):
    """Fetch the 'officialsites' page of a movie and parse it with
    officialsites_parser."""
    url = imdbURL_movie_main % movieID + 'officialsites'
    page = self._retrieve(url)
    return self.mProxy.officialsites_parser.parse(page)
def get_movie_trivia(self, movieID):
    """Fetch the 'trivia' page of a movie and parse it with trivia_parser."""
    url = imdbURL_movie_main % movieID + 'trivia'
    page = self._retrieve(url)
    return self.mProxy.trivia_parser.parse(page, getRefs=self._getRefs)
def get_movie_connections(self, movieID):
    """Fetch the 'movieconnections' page of a movie and parse it with
    connections_parser."""
    url = imdbURL_movie_main % movieID + 'movieconnections'
    page = self._retrieve(url)
    return self.mProxy.connections_parser.parse(page)
def get_movie_technical(self, movieID):
    """Fetch the 'technical' page of a movie and parse it with tech_parser."""
    url = imdbURL_movie_main % movieID + 'technical'
    page = self._retrieve(url)
    return self.mProxy.tech_parser.parse(page)
def get_movie_business(self, movieID):
    """Fetch the 'business' page of a movie and parse it with business_parser."""
    url = imdbURL_movie_main % movieID + 'business'
    page = self._retrieve(url)
    return self.mProxy.business_parser.parse(page, getRefs=self._getRefs)
def get_movie_literature(self, movieID):
    """Fetch the 'literature' page of a movie and parse it with
    literature_parser."""
    url = imdbURL_movie_main % movieID + 'literature'
    page = self._retrieve(url)
    return self.mProxy.literature_parser.parse(page)
def get_movie_locations(self, movieID):
    """Fetch the 'locations' page of a movie and parse it with locations_parser."""
    url = imdbURL_movie_main % movieID + 'locations'
    page = self._retrieve(url)
    return self.mProxy.locations_parser.parse(page)
def get_movie_soundtrack(self, movieID):
    """Fetch the 'soundtrack' page of a movie and parse it with
    soundtrack_parser."""
    url = imdbURL_movie_main % movieID + 'soundtrack'
    page = self._retrieve(url)
    return self.mProxy.soundtrack_parser.parse(page)
def get_movie_dvd(self, movieID):
    """Fetch the 'dvd' page of a movie and parse it with dvd_parser."""
    url = imdbURL_movie_main % movieID + 'dvd'
    page = self._retrieve(url)
    return self.mProxy.dvd_parser.parse(page, getRefs=self._getRefs)
def get_movie_recommendations(self, movieID):
    """Fetch the 'recommendations' page of a movie and parse it with rec_parser."""
    url = imdbURL_movie_main % movieID + 'recommendations'
    page = self._retrieve(url)
    return self.mProxy.rec_parser.parse(page)
def get_movie_external_reviews(self, movieID):
    """Fetch the 'externalreviews' page of a movie and parse it with
    externalrev_parser."""
    url = imdbURL_movie_main % movieID + 'externalreviews'
    page = self._retrieve(url)
    return self.mProxy.externalrev_parser.parse(page)
def get_movie_newsgroup_reviews(self, movieID):
    """Fetch the 'newsgroupreviews' page of a movie and parse it with
    newsgrouprev_parser."""
    url = imdbURL_movie_main % movieID + 'newsgroupreviews'
    page = self._retrieve(url)
    return self.mProxy.newsgrouprev_parser.parse(page)
def get_movie_misc_sites(self, movieID):
    """Fetch the 'miscsites' page of a movie and parse it with misclinks_parser."""
    url = imdbURL_movie_main % movieID + 'miscsites'
    page = self._retrieve(url)
    return self.mProxy.misclinks_parser.parse(page)
def get_movie_sound_clips(self, movieID):
    """Fetch the 'soundsites' page of a movie and parse it with
    soundclips_parser."""
    url = imdbURL_movie_main % movieID + 'soundsites'
    page = self._retrieve(url)
    return self.mProxy.soundclips_parser.parse(page)
def get_movie_video_clips(self, movieID):
    """Fetch the 'videosites' page of a movie and parse it with
    videoclips_parser."""
    url = imdbURL_movie_main % movieID + 'videosites'
    page = self._retrieve(url)
    return self.mProxy.videoclips_parser.parse(page)
def get_movie_photo_sites(self, movieID):
    """Fetch the 'photosites' page of a movie and parse it with
    photosites_parser."""
    url = imdbURL_movie_main % movieID + 'photosites'
    page = self._retrieve(url)
    return self.mProxy.photosites_parser.parse(page)
def get_movie_news(self, movieID):
    """Fetch the 'news' page of a movie and parse it with news_parser."""
    url = imdbURL_movie_main % movieID + 'news'
    page = self._retrieve(url)
    return self.mProxy.news_parser.parse(page, getRefs=self._getRefs)
def get_movie_amazon_reviews(self, movieID):
    """Fetch the 'amazon' page of a movie and parse it with amazonrev_parser."""
    url = imdbURL_movie_main % movieID + 'amazon'
    page = self._retrieve(url)
    return self.mProxy.amazonrev_parser.parse(page)
def get_movie_guests(self, movieID):
    """Retrieve the movie's 'epcast' page and parse it with episodes_cast_parser."""
    url = imdbURL_movie_main % movieID + 'epcast'
    page = self._retrieve(url)
    return self.mProxy.episodes_cast_parser.parse(page)
# Same method under a second name.
get_movie_episodes_cast = get_movie_guests
def get_movie_merchandising_links(self, movieID):
    """Retrieve the movie's 'sales' page and parse it with sales_parser."""
    url = imdbURL_movie_main % movieID + 'sales'
    page = self._retrieve(url)
    return self.mProxy.sales_parser.parse(page)
def get_movie_episodes(self, movieID):
    """Retrieve the movie's 'episodes' page and parse it with episodes_parser.

    When the parsed data holds an 'episodes' mapping (season -> episodes),
    every episode's 'episode of' entry gets its movieID set to the series'
    movieID, and a non-zero episode total is stored under
    data['number of episodes'].
    """
    page = self._retrieve(imdbURL_movie_main % movieID + 'episodes')
    parsed = self.mProxy.episodes_parser.parse(page)
    if 'episodes' in parsed.get('data', {}):
        total = 0
        for season in parsed['data']['episodes'].values():
            for ep in season.values():
                # Link each episode back to the series it belongs to.
                ep['episode of'].movieID = movieID
                total += 1
        # Only record the count when at least one episode was found.
        if total:
            parsed['data']['number of episodes'] = total
    return parsed
def get_movie_episodes_rating(self, movieID):
    """Retrieve the movie's 'epdate' page and parse it with eprating_parser.

    For every item under data['episodes rating'], the episode's
    'episode of' entry gets its movieID set to the series' movieID.
    """
    page = self._retrieve(imdbURL_movie_main % movieID + 'epdate')
    parsed = self.mProxy.eprating_parser.parse(page)
    if 'episodes rating' in parsed.get('data', {}):
        for entry in parsed['data']['episodes rating']:
            entry['episode']['episode of'].movieID = movieID
    return parsed
def get_movie_faqs(self, movieID):
    """Retrieve the movie's 'faq' page and parse it with movie_faqs_parser.

    The instance's _getRefs setting is forwarded to the parser.
    """
    url = imdbURL_movie_main % movieID + 'faq'
    page = self._retrieve(url)
    return self.mProxy.movie_faqs_parser.parse(page, getRefs=self._getRefs)
def get_movie_airing(self, movieID):
    """Retrieve the movie's 'tvschedule' page and parse it with airing_parser."""
    url = imdbURL_movie_main % movieID + 'tvschedule'
    page = self._retrieve(url)
    return self.mProxy.airing_parser.parse(page)
# Same method under a second name.
get_movie_tv_schedule = get_movie_airing
def get_movie_synopsis(self, movieID):
    """Retrieve the movie's 'synopsis' page and parse it with synopsis_parser."""
    url = imdbURL_movie_main % movieID + 'synopsis'
    page = self._retrieve(url)
    return self.mProxy.synopsis_parser.parse(page)
def get_movie_parents_guide(self, movieID):
    """Retrieve the movie's 'parentalguide' page and parse it with parentsguide_parser."""
    url = imdbURL_movie_main % movieID + 'parentalguide'
    page = self._retrieve(url)
    return self.mProxy.parentsguide_parser.parse(page)
def _search_person(self, name, results):
# The URL of the query.
# XXX: To retrieve the complete results list:
# params = urllib.urlencode({'more': 'nm', 'q': name})
##params = urllib.urlencode({'nm': 'on', 'mx': str(results), 'q': name})
#params = 'q=%s&nm=on&mx=%s' % (quote_plus(name), str(results))
#cont = self._retrieve(imdbURL_find % params)
cont = self._get_search_content('nm', name, results)
return self.spProxy.search_person_parser.parse(cont, results=results)['data']
def get_person_main(self, personID):
    """Retrieve the person's 'maindetails' page and parse it with maindetails_parser.

    The parsed result is tagged with the info sets it covers
    ('main' and 'filmography') before being returned.
    """
    url = imdbURL_person_main % personID + 'maindetails'
    parsed = self.pProxy.maindetails_parser.parse(self._retrieve(url))
    parsed['info sets'] = ('main', 'filmography')
    return parsed
def get_person_filmography(self, personID):
    """Return the same data as get_person_main for this person."""
    main_data = self.get_person_main(personID)
    return main_data
def get_person_biography(self, personID):
    """Retrieve the person's 'bio' page and parse it with bio_parser.

    The instance's _getRefs setting is forwarded to the parser.
    """
    url = imdbURL_person_main % personID + 'bio'
    page = self._retrieve(url)
    return self.pProxy.bio_parser.parse(page, getRefs=self._getRefs)
def get_person_awards(self, personID):
    """Retrieve the person's 'awards' page and parse it with person_awards_parser."""
    url = imdbURL_person_main % personID + 'awards'
    page = self._retrieve(url)
    return self.pProxy.person_awards_parser.parse(page)
def get_person_other_works(self, personID):
    """Retrieve the person's 'otherworks' page and parse it with otherworks_parser.

    The instance's _getRefs setting is forwarded to the parser.
    """
    url = imdbURL_person_main % personID + 'otherworks'
    page = self._retrieve(url)
    return self.pProxy.otherworks_parser.parse(page, getRefs=self._getRefs)
#def get_person_agent(self, personID):
# cont = self._retrieve(imdbURL_person_main % personID + 'agent')
# return self.pProxy.agent_parser.parse(cont)
def get_person_publicity(self, personID):
    """Retrieve the person's 'publicity' page and parse it with publicity_parser."""
    url = imdbURL_person_main % personID + 'publicity'
    page = self._retrieve(url)
    return self.pProxy.publicity_parser.parse(page)
def get_person_official_sites(self, personID):
    """Retrieve the person's 'officialsites' page and parse it with person_officialsites_parser."""
    url = imdbURL_person_main % personID + 'officialsites'
    page = self._retrieve(url)
    return self.pProxy.person_officialsites_parser.parse(page)
def get_person_news(self, personID):
    """Retrieve the person's 'news' page and parse it with pProxy.news_parser."""
    url = imdbURL_person_main % personID + 'news'
    page = self._retrieve(url)
    return self.pProxy.news_parser.parse(page)
def get_person_episodes(self, personID):
    """Retrieve the person's 'filmoseries' page and parse it with person_series_parser."""
    url = imdbURL_person_main % personID + 'filmoseries'
    page = self._retrieve(url)
    return self.pProxy.person_series_parser.parse(page)
def get_person_merchandising_links(self, personID):
    """Retrieve the person's 'forsale' page and parse it with pProxy.sales_parser."""
    url = imdbURL_person_main % personID + 'forsale'
    page = self._retrieve(url)
    return self.pProxy.sales_parser.parse(page)
def get_person_genres_links(self, personID):
    """Retrieve the person's 'filmogenre' page and parse it with person_genres_parser."""
    url = imdbURL_person_main % personID + 'filmogenre'
    page = self._retrieve(url)
    return self.pProxy.person_genres_parser.parse(page)
def get_person_keywords_links(self, personID):
    """Retrieve the person's 'filmokey' page and parse it with person_keywords_parser."""
    url = imdbURL_person_main % personID + 'filmokey'
    page = self._retrieve(url)
    return self.pProxy.person_keywords_parser.parse(page)
def _search_character(self, name, results):
cont = self._get_search_content('char', name, results)
return self.scProxy.search_character_parser.parse(cont, results=results)['data']
def get_character_main(self, characterID):
    """Retrieve the character's main page and parse it with character_main_parser.

    The parsed result is tagged with the info sets it covers
    ('main' and 'filmography') before being returned.
    """
    url = imdbURL_character_main % characterID
    parsed = self.cProxy.character_main_parser.parse(self._retrieve(url))
    parsed['info sets'] = ('main', 'filmography')
    return parsed
# Same method under a second name.
get_character_filmography = get_character_main
def get_character_biography(self, characterID):
    """Retrieve the character's 'bio' page and parse it with character_bio_parser.

    The instance's _getRefs setting is forwarded to the parser.
    """
    url = imdbURL_character_main % characterID + 'bio'
    page = self._retrieve(url)
    return self.cProxy.character_bio_parser.parse(page, getRefs=self._getRefs)
def get_character_episodes(self, characterID):
    """Retrieve the character's 'filmoseries' page and parse it with character_series_parser."""
    url = imdbURL_character_main % characterID + 'filmoseries'
    page = self._retrieve(url)
    return self.cProxy.character_series_parser.parse(page)
def get_character_quotes(self, characterID):
    """Retrieve the character's 'quotes' page and parse it with character_quotes_parser.

    The instance's _getRefs setting is forwarded to the parser.
    """
    url = imdbURL_character_main % characterID + 'quotes'
    page = self._retrieve(url)
    return self.cProxy.character_quotes_parser.parse(page, getRefs=self._getRefs)
def _search_company(self, name, results):
cont = self._get_search_content('co', name, results)
url = self.urlOpener._last_url
return self.scompProxy.search_company_parser.parse(cont, url=url,
results=results)['data']
def get_company_main(self, companyID):
    """Retrieve the company's main page and parse it with company_main_parser."""
    page = self._retrieve(imdbURL_company_main % companyID)
    return self.compProxy.company_main_parser.parse(page)
def _search_keyword(self, keyword, results):
# XXX: the IMDb web server seems to have some serious problem with
# non-ascii keyword.
# E.g.: http://akas.imdb.com/keyword/fianc%E9/
# will return a 500 Internal Server Error: Redirect Recursion.
keyword = keyword.encode('utf8', 'ignore')
try:
cont = self._get_search_content('kw', keyword, results)
except IMDbDataAccessError:
self._http_logger.warn('unable to search for keyword %s', keyword,
exc_info=True)
return []
return self.skProxy.search_keyword_parser.parse(cont, results=results)['data']
def _get_keyword(self, keyword, results):
    """Retrieve the page of a keyword and return the parser's 'data' entry.

    The keyword is encoded to UTF-8 (ignoring unencodable characters)
    before being substituted into imdbURL_keyword_main.  If the retrieval
    raises IMDbDataAccessError, a warning is logged and an empty list is
    returned.
    """
    keyword = keyword.encode('utf8', 'ignore')
    try:
        page = self._retrieve(imdbURL_keyword_main % keyword)
    except IMDbDataAccessError:
        self._http_logger.warn('unable to get keyword %s', keyword,
                               exc_info=True)
        return []
    else:
        parsed = self.skProxy.search_moviekeyword_parser.parse(page,
                                                               results=results)
        return parsed['data']
def _get_top_bottom_movies(self, kind):
if kind == 'top':
parser = self.topBottomProxy.top250_parser
url = imdbURL_top250
elif kind == 'bottom':
parser = self.topBottomProxy.bottom100_parser
url = imdbURL_bottom100
else:
return []
cont = self._retrieve(url)
return parser.parse(cont)['data']
File diff suppressed because it is too large Load Diff
@@ -1,394 +0,0 @@
"""
parser.http.bsoupxpath module (imdb.parser.http package).
This module provides XPath support for BeautifulSoup.
Copyright 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
__author__ = 'H. Turgut Uyar <uyar@tekir.org>'
__docformat__ = 'restructuredtext'
import re
import string
import _bsoup as BeautifulSoup
# XPath related enumerations and constants

# Names of the XPath axes this module knows about, as they are written in a
# location step before the '::' separator.
AXIS_ANCESTOR = 'ancestor'
AXIS_ATTRIBUTE = 'attribute'
AXIS_CHILD = 'child'
AXIS_DESCENDANT = 'descendant'
AXIS_FOLLOWING = 'following'
AXIS_FOLLOWING_SIBLING = 'following-sibling'
AXIS_PRECEDING_SIBLING = 'preceding-sibling'

# All axis names; joined with '|' to build PathStep.AXIS_PATTERN.
AXES = (AXIS_ANCESTOR, AXIS_ATTRIBUTE, AXIS_CHILD, AXIS_DESCENDANT,
        AXIS_FOLLOWING, AXIS_FOLLOWING_SIBLING, AXIS_PRECEDING_SIBLING)

# Function names accepted inside predicates; joined with '|' to build
# PathStep.PREDICATE_FUNCTION_PATTERN.
XPATH_FUNCTIONS = ('starts-with', 'string-length')
def tokenize_path(path):
    """Tokenize a location path into location steps. Return the list of steps.

    If two steps are separated by a double slash, the double slashes are part
    of the second step. If they are separated by only one slash, the slash is
    not included in any of the steps.  Slashes that appear inside a
    single-quoted string are not treated as separators.

    A path that ends with a slash produces an empty final step (just as an
    empty path produces a single empty step) instead of raising IndexError.
    """
    length = len(path)
    # (start, end) positions of every step, in order of appearance
    boundaries = []
    step_start = 0
    in_string = False
    pos = 0
    while pos < length:
        char = path[pos]
        if char == "'":
            in_string = not in_string
        # slashes within strings are not step separators
        if not in_string and char == '/':
            if pos > 0:
                boundaries.append((step_start, pos))
            # Peek at the next character only when there is one: a trailing
            # slash used to index past the end of the string here.
            if pos + 1 < length and path[pos + 1] == '/':
                # '//' belongs to the step that follows it
                step_start = pos
                pos += 1
            else:
                step_start = pos + 1
        pos += 1
    boundaries.append((step_start, length))
    return [path[start:end] for start, end in boundaries]
class Path:
    """A location path: the original expression plus its parsed steps."""

    def __init__(self, path, parse=True):
        """Store `path`; unless `parse` is false, split it into PathStep objects.

        With parse=False the `steps` list is left empty for the caller to
        fill in.
        """
        self.path = path
        self.steps = []
        if not parse:
            return
        if path[0] == '/' and path[1] != '/':
            # if not on the descendant axis, remove the leading slash
            path = path[1:]
        for token in tokenize_path(path):
            self.steps.append(PathStep(token))

    def apply(self, node):
        """Apply the path to a node. Return the resulting list of nodes.

        The steps are applied in order, each one receiving the node list
        produced by the previous one.
        """
        # FIXME: this should return a node SET, not a node LIST
        #        or at least a list with no duplicates
        if self.path[0] == '/':
            # for an absolute path, start from the root
            at_root = isinstance(node, BeautifulSoup.Tag) \
                and node.name == '[document]'
            if not at_root:
                node = node.findParent('[document]')
        current = [node]
        for step in self.steps:
            current = step.apply(current)
        return current
class PathStep:
    """A location step in a location path.

    A step is parsed once, at construction time, into an axis, a node test,
    keyword arguments for the BeautifulSoup search methods (`soup_args`),
    an optional positional `index` and a list of `checkers` (callables used
    to filter the nodes that were found).
    """

    # Building blocks of the step grammar.  The group numbers used by the
    # methods below depend on the exact nesting of these patterns.
    AXIS_PATTERN = r"""(%s)::|@""" % '|'.join(AXES)
    NODE_TEST_PATTERN = r"""\w+(\(\))?"""
    PREDICATE_PATTERN = r"""\[(.*?)\]"""
    LOCATION_STEP_PATTERN = r"""(%s)?(%s)((%s)*)""" \
        % (AXIS_PATTERN, NODE_TEST_PATTERN, PREDICATE_PATTERN)

    _re_location_step = re.compile(LOCATION_STEP_PATTERN)

    PREDICATE_NOT_PATTERN = r"""not\((.*?)\)"""
    PREDICATE_AXIS_PATTERN = r"""(%s)?(%s)(='(.*?)')?""" \
        % (AXIS_PATTERN, NODE_TEST_PATTERN)
    PREDICATE_FUNCTION_PATTERN = r"""(%s)\(([^,]+(,\s*[^,]+)*)?\)(=(.*))?""" \
        % '|'.join(XPATH_FUNCTIONS)

    _re_predicate_not = re.compile(PREDICATE_NOT_PATTERN)
    _re_predicate_axis = re.compile(PREDICATE_AXIS_PATTERN)
    _re_predicate_function = re.compile(PREDICATE_FUNCTION_PATTERN)

    def __init__(self, step):
        """Parse the textual `step`.

        The special steps '.' and '..' are only recorded; nothing else is
        set up for them because apply() handles them directly.
        """
        self.step = step
        if step in ('.', '..'):
            return

        # A leading '//' selects the descendant axis by default.
        if step.startswith('//'):
            default_axis = AXIS_DESCENDANT
            step = step[2:]
        else:
            default_axis = AXIS_CHILD

        parsed = self._re_location_step.match(step)

        # determine the axis: group 1 is the whole axis prefix ('name::' or
        # '@'), group 2 is the bare axis name
        axis_prefix = parsed.group(1)
        if axis_prefix is None:
            self.axis = default_axis
        elif axis_prefix == '@':
            self.axis = AXIS_ATTRIBUTE
        else:
            self.axis = parsed.group(2)

        self.soup_args = {}
        self.index = None

        # group 3 is the node test
        self.node_test = parsed.group(3)
        if self.node_test == 'text()':
            self.soup_args['text'] = True
        else:
            self.soup_args['name'] = self.node_test

        self.checkers = []
        # group 5 is the run of '[...]' predicates, e.g. "[a][b]"
        predicates = parsed.group(5)
        if predicates is not None:
            for predicate in predicates[1:-1].split(']['):
                if not predicate:
                    continue
                checker = self.__parse_predicate(predicate)
                if checker is not None:
                    self.checkers.append(checker)

    def __parse_predicate(self, predicate):
        """Parse the predicate. Return a callable that can be used to filter
        nodes, or None when the predicate could be expressed through
        `self.soup_args` / `self.index` instead.

        Raise NotImplementedError for predicates that are not supported.
        """
        # Purely numeric predicate: a position.
        try:
            position = int(predicate)
        except ValueError:
            pass
        else:
            if self.axis == AXIS_DESCENDANT:
                return PredicateFilter('position', value=position)
            # use the search limit feature instead of a checker
            self.soup_args['limit'] = position
            self.index = position - 1
            return None

        if predicate == "last()":
            self.index = -1
            return None

        negate = self._re_predicate_not.match(predicate)
        if negate:
            predicate = negate.group(1)

        function_match = self._re_predicate_function.match(predicate)
        if function_match:
            name = function_match.group(1)
            arguments = function_match.group(2)
            # group 4 is the whole '=value' part, group 5 the value itself
            if function_match.group(4) is not None:
                value = function_match.group(5)
            else:
                value = None
            return PredicateFilter(name, arguments, value)

        axis_match = self._re_predicate_axis.match(predicate)
        if axis_match:
            axis = axis_match.group(1)
            if axis is None:
                axis = AXIS_CHILD
            elif axis == '@':
                axis = AXIS_ATTRIBUTE

            if axis == AXIS_ATTRIBUTE:
                # use the attribute search feature instead of a checker
                attribute_name = axis_match.group(3)
                if axis_match.group(5) is not None:
                    attribute_value = axis_match.group(6)
                elif not negate:
                    attribute_value = True
                else:
                    attribute_value = None
                attrs = self.soup_args.setdefault('attrs', {})
                attrs[attribute_name] = attribute_value
                return None

            if axis == AXIS_CHILD:
                return PredicateFilter('axis', axis_match.group(3),
                                       value=axis_match.group(6),
                                       negate=negate)

        raise NotImplementedError("This predicate is not implemented")

    def apply(self, nodes):
        """Apply the step to a list of nodes. Return the list of nodes for the
        next step.
        """
        if self.step == '.':
            return nodes
        if self.step == '..':
            return [node.parent for node in nodes]

        collected = []
        for node in nodes:
            if self.axis == AXIS_CHILD:
                found = node.findAll(recursive=False, **self.soup_args)
            elif self.axis == AXIS_DESCENDANT:
                found = node.findAll(recursive=True, **self.soup_args)
            elif self.axis == AXIS_ATTRIBUTE:
                try:
                    found = [node[self.node_test]]
                except KeyError:
                    found = []
            elif self.axis == AXIS_FOLLOWING_SIBLING:
                found = node.findNextSiblings(**self.soup_args)
            elif self.axis == AXIS_PRECEDING_SIBLING:
                # TODO: make sure that the result is reverse ordered
                found = node.findPreviousSiblings(**self.soup_args)
            elif self.axis == AXIS_FOLLOWING:
                # find the last descendant of this node
                last = node
                while (not isinstance(last, BeautifulSoup.NavigableString)) \
                        and (len(last.contents) > 0):
                    last = last.contents[-1]
                found = last.findAllNext(**self.soup_args)
            elif self.axis == AXIS_ANCESTOR:
                found = node.findParents(**self.soup_args)

            # this should only be active if there is a position predicate
            # and the axis is not 'descendant'
            if self.index is not None and found:
                if len(found) > self.index:
                    found = [found[self.index]]
                else:
                    found = []

            if found:
                for checker in self.checkers:
                    found = [item for item in found if checker(item)]
                collected.extend(found)

        return collected
class PredicateFilter:
    """A callable class for filtering nodes.

    `name` selects the kind of test: 'position', 'axis', 'starts-with' or
    'string-length'; any other name raises NotImplementedError.
    `arguments` and `value` are interpreted according to `name`; when
    `negate` is true the result of the test is inverted.
    """

    def __init__(self, name, arguments=None, value=None, negate=False):
        self.name = name
        self.arguments = arguments
        self.negate = negate

        if name == 'position':
            self.__filter = self.__position
            self.value = value
        elif name == 'axis':
            self.__filter = self.__axis
            self.node_test = arguments
            self.value = value
        elif name == 'starts-with':
            self.__filter = self.__starts_with
            args = [arg.strip() for arg in arguments.split(',')]
            # Stored as (is_attribute, name, prefix); [1:-1] drops the
            # quotes around the prefix literal.
            if args[0][0] == '@':
                self.arguments = (True, args[0][1:], args[1][1:-1])
            else:
                self.arguments = (False, args[0], args[1][1:-1])
        elif name == 'string-length':
            self.__filter = self.__string_length
            args = [arg.strip() for arg in arguments.split(',')]
            # Stored as (is_attribute, name).
            if args[0][0] == '@':
                self.arguments = (True, args[0][1:])
            else:
                self.arguments = (False, args[0])
            self.value = int(value)
        else:
            raise NotImplementedError("This XPath function is not implemented")

    def __call__(self, node):
        """Return the (possibly negated) result of the test on `node`."""
        if self.negate:
            return not self.__filter(node)
        else:
            return self.__filter(node)

    def __position(self, node):
        """True if `node` is the self.value-th sibling of its kind (1-based)."""
        if isinstance(node, BeautifulSoup.NavigableString):
            actual_position = len(node.findPreviousSiblings(text=True)) + 1
        else:
            actual_position = len(node.findPreviousSiblings(node.name)) + 1
        return actual_position == self.value

    def __axis(self, node):
        """Test the node's text or direct children against self.value."""
        if self.node_test == 'text()':
            return node.string == self.value
        else:
            children = node.findAll(self.node_test, recursive=False)
            # Without a value the mere presence of a child is enough.
            if len(children) > 0 and self.value is None:
                return True
            for child in children:
                if child.string == self.value:
                    return True
            return False

    def __starts_with(self, node):
        """True if the attribute or first text child starts with the prefix."""
        if self.arguments[0]:
            # this is an attribute
            attribute_name = self.arguments[1]
            if node.has_key(attribute_name):
                first = node[attribute_name]
                return first.startswith(self.arguments[2])
        elif self.arguments[1] == 'text()':
            # A node without children has no text to compare; indexing
            # contents[0] unconditionally raised IndexError in that case.
            if node.contents:
                first = node.contents[0]
                if isinstance(first, BeautifulSoup.NavigableString):
                    return first.startswith(self.arguments[2])
        return False

    def __string_length(self, node):
        """True if the attribute or text length equals self.value."""
        # Default for a missing attribute and for an argument that is neither
        # an attribute nor text(); the latter used to leave `value` unbound.
        value = None
        if self.arguments[0]:
            # this is an attribute
            attribute_name = self.arguments[1]
            if node.has_key(attribute_name):
                value = node[attribute_name]
        elif self.arguments[1] == 'text()':
            value = node.string
        if value is not None:
            return len(value) == self.value
        return False
# Caches of already parsed paths and steps, keyed by their source text.
_paths = {}
_steps = {}

def get_path(path):
    """Utility for eliminating repeated parsings of the same paths and steps.

    Return the cached Path for `path`, building it on first use; the
    PathStep objects are cached as well and shared between paths.
    """
    cached = _paths.get(path)
    if cached is None:
        cached = Path(path, parse=False)
        for token in tokenize_path(path):
            if token not in _steps:
                _steps[token] = PathStep(token)
            cached.steps.append(_steps[token])
        _paths[path] = cached
    return cached
-75
View File
@@ -1,75 +0,0 @@
"""
parser.http.bsouplxml.etree module (imdb.parser.http package).
This module adapts the beautifulsoup interface to lxml.etree module.
Copyright 2008 H. Turgut Uyar <uyar@tekir.org>
2008 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import _bsoup as BeautifulSoup
from _bsoup import Tag as Element
import bsoupxpath
# Not directly used by IMDbPY, but do not remove: it's used by IMDbPYKit,
# for example.
def fromstring(xml_string):
    """Return a DOM representation of the string."""
    # We try to not use BeautifulSoup.BeautifulStoneSoup.XML_ENTITIES,
    # for convertEntities.
    soup = BeautifulSoup.BeautifulStoneSoup(xml_string, convertEntities=None)
    return soup.findChild(True)
def tostring(element, encoding=None, pretty_print=False):
    """Return a string or unicode representation of an element.

    Passing the `unicode` type as `encoding` is treated like passing None.
    """
    target_encoding = None if encoding is unicode else encoding
    # NOTE: for BeautifulSoup 3.1 the equivalent call would be
    #       element.encode(prettyPrint=pretty_print), adding
    #       encoding=... when an encoding is given.
    return element.__str__(target_encoding, pretty_print)
def setattribute(tag, name, value):
    """Set attribute `name` of `tag` to `value` through item assignment."""
    tag[name] = value
def xpath(node, expr):
    """Apply an xpath expression to a node. Return a list of nodes."""
    # get_path() is used instead of building a bsoupxpath.Path directly.
    compiled = bsoupxpath.get_path(expr)
    return compiled.apply(node)
# XXX: monkey patching the beautifulsoup tag class
class _EverythingIsNestable(dict):
""""Fake that every tag is nestable."""
def get(self, key, *args, **kwds):
return []
# Make every NESTABLE_TAGS.get() lookup of the stone-soup parser return [].
BeautifulSoup.BeautifulStoneSoup.NESTABLE_TAGS = _EverythingIsNestable()
# Add lxml-style names to BeautifulSoup tags (this module adapts the
# BeautifulSoup interface to lxml.etree):
# read-only views of the tag name, the tag itself and its string...
BeautifulSoup.Tag.tag = property(fget=lambda self: self.name)
BeautifulSoup.Tag.attrib = property(fget=lambda self: self)
BeautifulSoup.Tag.text = property(fget=lambda self: self.string)
# ...and methods implemented by functions of this module or by existing
# BeautifulSoup methods.
BeautifulSoup.Tag.set = setattribute
BeautifulSoup.Tag.getparent = lambda self: self.parent
BeautifulSoup.Tag.drop_tree = BeautifulSoup.Tag.extract
BeautifulSoup.Tag.xpath = xpath
# TODO: setting the text attribute for tags
-31
View File
@@ -1,31 +0,0 @@
"""
parser.http.bsouplxml.html module (imdb.parser.http package).
This module adapts the beautifulsoup interface to lxml.html module.
Copyright 2008 H. Turgut Uyar <uyar@tekir.org>
2008 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import _bsoup as BeautifulSoup
def fromstring(html_string):
    """Return a DOM representation of the string."""
    entities = BeautifulSoup.BeautifulSoup.HTML_ENTITIES
    soup = BeautifulSoup.BeautifulSoup(html_string, convertEntities=entities)
    return soup.findChild(True)
-203
View File
@@ -1,203 +0,0 @@
"""
parser.http.characterParser module (imdb package).
This module provides the classes (and the instances), used to parse
the IMDb pages on the akas.imdb.com server about a character.
E.g., for "Jesse James" the referred pages would be:
main details: http://www.imdb.com/character/ch0000001/
biography: http://www.imdb.com/character/ch0000001/bio
...and so on...
Copyright 2007-2009 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import re
from utils import Attribute, Extractor, DOMParserBase, build_movie, \
analyze_imdbid
from personParser import DOMHTMLMaindetailsParser
from imdb.Movie import Movie
# Captures the seven digits of every '/name/nm<digits>' link in a string.
_personIDs = re.compile(r'/name/nm([0-9]{7})')
class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
    """Parser for the "filmography" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLCharacterMaindetailsParser()
        result = cparser.parse(character_filmography_html_string)
    """
    _containsObjects = True

    # Shared by both filmography extractors below: every matched <li> is
    # turned into a movie through build_movie().  'roleID' collects the
    # hrefs of the item's links and _personIDs pulls the person IDs out
    # of them.
    _film_attrs = [Attribute(key=None,
                      multi=True,
                      path={
                          'link': "./a[1]/@href",
                          'title': ".//text()",
                          'status': "./i/a//text()",
                          'roleID': "./a/@href"
                          },
                      postprocess=lambda x:
                          build_movie(x.get('title') or u'',
                              movieID=analyze_imdbid(x.get('link') or u''),
                              roleID=_personIDs.findall(x.get('roleID') or u''),
                              status=x.get('status') or None,
                              _parsingCharacter=True))]

    extractors = [
            # The name is the page title without the ' (Character)' and
            # '- Filmography by type' parts.
            Extractor(label='title',
                        path="//title",
                        attrs=Attribute(key='name',
                            path="./text()",
                            postprocess=lambda x: \
                                x.replace(' (Character)', '').replace(
                                    '- Filmography by type', '').strip())),

            Extractor(label='headshot',
                        path="//a[@name='headshot']",
                        attrs=Attribute(key='headshot',
                            path="./img/@src")),

            # Alternate names are separated by ' / ' on the page.
            Extractor(label='akas',
                        path="//div[h5='Alternate Names:']",
                        attrs=Attribute(key='akas',
                            path="./div//text()",
                            postprocess=lambda x: x.strip().split(' / '))),

            # Filmography blocks without an <h5> heading.
            Extractor(label='filmography',
                        path="//div[@class='filmo'][not(h5)]/ol/li",
                        attrs=_film_attrs),

            # Filmography blocks with an <h5> heading; the group key is the
            # lower-cased heading text minus its last character.
            Extractor(label='filmography sections',
                        group="//div[@class='filmo'][h5]",
                        group_key="./h5/a/text()",
                        group_key_normalize=lambda x: x.lower()[:-1],
                        path="./ol/li",
                        attrs=_film_attrs),
            ]

    preprocessors = [
            # Check that this doesn't cut "status"...
            (re.compile(r'<br>(\.\.\.| ).+?</li>', re.I | re.M), '</li>')]
class DOMHTMLCharacterBioParser(DOMParserBase):
    """Parser for the "biography" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bparser = DOMHTMLCharacterBioParser()
        result = bparser.parse(character_biography_html_string)
    """
    _defGetRefs = True

    # Both extractors select elements (div#_intro, span._biography) that are
    # not in the served page: the preprocessors below insert them.
    extractors = [
            Extractor(label='introduction',
                        path="//div[@id='_intro']",
                        attrs=Attribute(key='introduction',
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            # Every biography section becomes "<heading>: <text>", where the
            # heading is the closest preceding <h4>; '||' markers left by the
            # preprocessors become blank lines.
            Extractor(label='biography',
                        path="//span[@class='_biography']",
                        attrs=Attribute(key='biography',
                            multi=True,
                            path={
                                'info': "./preceding-sibling::h4[1]//text()",
                                'text': ".//text()"
                            },
                            postprocess=lambda x: u'%s: %s' % (
                                x.get('info').strip(),
                                x.get('text').replace('\n',
                                    ' ').replace('||', '\n\n').strip()))),
    ]

    # Regex rewrites of the raw HTML; judging by the patterns they are meant
    # to run in this order (later ones match text produced by earlier ones).
    preprocessors = [
        # open the artificial '_intro' div
        (re.compile('(<div id="swiki.2.3.1">)', re.I), r'\1<div id="_intro">'),
        # close it before the 'history' anchor and its table
        (re.compile('(<a name="history">)\s*(<table .*?</table>)',
                    re.I | re.DOTALL),
         r'</div>\2\1</a>'),
        # wrap the text after each section heading in a '_biography' span
        (re.compile('(<a name="[^"]+">)(<h4>)', re.I), r'</span>\1</a>\2'),
        (re.compile('(</h4>)</a>', re.I), r'\1<span class="_biography">'),
        # paragraph breaks become '||' markers
        (re.compile('<br/><br/>', re.I), r'||'),
        (re.compile('\|\|\n', re.I), r'</span>'),
        ]
class DOMHTMLCharacterQuotesParser(DOMParserBase):
    """Parser for the "quotes" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        qparser = DOMHTMLCharacterQuotesParser()
        result = qparser.parse(character_quotes_html_string)
    """
    _defGetRefs = True

    # One group per <h5> heading; the quotes are read from the <div> that the
    # preprocessors insert right after it.  Each group yields a tuple
    # (movieID, list of quotes), the quotes being split on the '||' marker.
    # NOTE(review): as written, both replace(': ', ': ') calls replace a
    # string with itself; the search strings may originally have contained
    # several spaces -- confirm against the upstream source.
    extractors = [
        Extractor(label='charquotes',
                    group="//h5",
                    group_key="./a/text()",
                    path="./following-sibling::div[1]",
                    attrs=Attribute(key=None,
                        path={'txt': ".//text()",
                              'movieID': ".//a[1]/@href"},
                        postprocess=lambda x: (analyze_imdbid(x['movieID']),
                                    x['txt'].strip().replace(': ',
                                    ': ').replace(': ', ': ').split('||'))))
    ]

    preprocessors = [
        # open a <div> after every heading
        (re.compile('(</h5>)', re.I), r'\1<div>'),
        # a double line break separates two quotes
        (re.compile('\s*<br/><br/>\s*', re.I), r'||'),
        # close the <div> before the horizontal rule
        (re.compile('\|\|\s*(<hr/>)', re.I), r'</div>\1'),
        # a single line break separates the lines of one quote
        (re.compile('\s*<br/>\s*', re.I), r'::')
        ]

    def postprocess_data(self, data):
        """Return {'quotes': {movie: [[line, ...], ...]}} built from `data`.

        `data` maps a title to a (movieID, quotes) tuple.  The key of the
        result is a Movie object when a movieID is available, otherwise the
        plain title; every quote is split into its lines on '::'.  An empty
        `data` yields an empty dictionary.
        """
        if not data:
            return {}
        newData = {}
        for title in data:
            movieID, quotes = data[title]
            if movieID is None:
                movie = title
            else:
                movie = Movie(title=title, movieID=movieID,
                            accessSystem=self._as, modFunct=self._modFunct)
            newData[movie] = [quote.split('::') for quote in quotes]
        return {'quotes': newData}
from personParser import DOMHTMLSeriesParser
# Table of the parsers of this module: each name maps to a tuple of
# (tuple of parser classes, keyword-arguments dictionary or None).
# NOTE(review): presumably consumed by the code that builds the parser
# proxies (e.g. cProxy.character_main_parser) -- confirm against the caller.
_OBJECTS = {
    'character_main_parser': ((DOMHTMLCharacterMaindetailsParser,),
                                {'kind': 'character'}),
    'character_series_parser': ((DOMHTMLSeriesParser,), None),
    'character_bio_parser': ((DOMHTMLCharacterBioParser,), None),
    'character_quotes_parser': ((DOMHTMLCharacterQuotesParser,), None)
}
-91
View File
@@ -1,91 +0,0 @@
"""
parser.http.companyParser module (imdb package).
This module provides the classes (and the instances), used to parse
the IMDb pages on the akas.imdb.com server about a company.
E.g., for "Columbia Pictures [us]" the referred page would be:
main details: http://akas.imdb.com/company/co0071509/
Copyright 2008-2009 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import re
from utils import build_movie, Attribute, Extractor, DOMParserBase, \
analyze_imdbid
from imdb.utils import analyze_company_name
class DOMCompanyParser(DOMParserBase):
    """Parser for the main page of a given company.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMCompanyParser()
        result = cparser.parse(company_html_string)
    """
    _containsObjects = True

    extractors = [
            # The company name is taken from the page title.
            Extractor(label='name',
                        path="//title",
                        attrs=Attribute(key='name',
                            path="./text()",
                            postprocess=lambda x: \
                                    analyze_company_name(x, stripNotes=True))),

            # One group per named anchor inside a <b>; the movies are the
            # items of the first <ol> following it.  The movie is built from
            # the string "<title> <year>".
            Extractor(label='filmography',
                        group="//b/a[@name]",
                        group_key="./text()",
                        group_key_normalize=lambda x: x.lower(),
                        path="../following-sibling::ol[1]/li",
                        attrs=Attribute(key=None,
                            multi=True,
                            path={
                                'link': "./a[1]/@href",
                                'title': "./a[1]/text()",
                                'year': "./text()[1]"
                                },
                            postprocess=lambda x:
                                build_movie(u'%s %s' % \
                                (x.get('title'), x.get('year').strip()),
                                movieID=analyze_imdbid(x.get('link') or u''),
                                _parsingCompany=True))),
            ]

    preprocessors = [
        # insert a closing </p> before every '<b><a name=' section heading
        (re.compile('(<b><a name=)', re.I), r'</p>\1')
        ]

    def postprocess_data(self, data):
        """Rename the keys of `data` in place and return it.

        In every key 'company' becomes 'companies', 'other' becomes
        'miscellaneous' and 'distributor' becomes 'distributors'.
        """
        # data.keys() is a list under Python 2, so the dictionary can be
        # modified while looping over it.
        for key in data.keys():
            new_key = key.replace('company', 'companies')
            new_key = new_key.replace('other', 'miscellaneous')
            new_key = new_key.replace('distributor', 'distributors')
            if new_key != key:
                data[new_key] = data[key]
                del data[key]
        return data
# Table of the parsers of this module: the name maps to a tuple of
# (tuple of parser classes, keyword-arguments dictionary or None).
_OBJECTS = {
    'company_main_parser': ((DOMCompanyParser,), None)
}
File diff suppressed because it is too large Load Diff
-559
View File
@@ -1,559 +0,0 @@
"""
parser.http.personParser module (imdb package).
This module provides the classes (and the instances), used to parse
the IMDb pages on the akas.imdb.com server about a person.
E.g., for "Mel Gibson" the referred pages would be:
categorized: http://akas.imdb.com/name/nm0000154/maindetails
biography: http://akas.imdb.com/name/nm0000154/bio
...and so on...
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import re
from imdb.Movie import Movie
from imdb.utils import analyze_name, canonicalName, normalizeName, \
analyze_title, date_and_notes
from utils import build_movie, DOMParserBase, Attribute, Extractor, \
analyze_imdbid
from movieParser import _manageRoles
# Three groups: the start of a list item up to and including ' .... ',
# the text that follows it, and the closing '</li>' or '<br>'.
_reRoles = re.compile(r'(<li>.*? \.\.\.\. )(.*?)(</li>|<br>)',
                        re.I | re.M | re.S)
def build_date(date):
    """Build a date string from the 'day' and 'year' entries of `date`.

    Return "<day> <year>" when both are set, the one that is set when only
    one is, and an empty string when neither is.
    """
    day, year = date.get('day'), date.get('year')
    if day and year:
        return "%s %s" % (day, year)
    return day or year or ""
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.

    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True

    # Birth date (day and year links merged by build_date) and birth place.
    _birth_attrs = [Attribute(key='birth date',
                        path={
                            'day': ".//a[starts-with(@href, " \
                                    "'/date/')]/text()",
                            'year': ".//a[starts-with(@href, " \
                                    "'/search/name?birth_year=')]/text()"
                            },
                        postprocess=build_date),
                    Attribute(key='birth place',
                        path=".//a[starts-with(@href, " \
                                "'/search/name?birth_place=')]/text()")]
    # Death date and death place, built the same way as the birth attributes.
    _death_attrs = [Attribute(key='death date',
                        path={
                            'day': ".//a[starts-with(@href, " \
                                    "'/date/')]/text()",
                            'year': ".//a[starts-with(@href, " \
                                    "'/search/name?death_year=')]/text()"
                            },
                        postprocess=build_date),
                    Attribute(key='death place',
                        path=".//a[starts-with(@href, " \
                                "'/search/name?death_place=')]/text()")]
    # One filmography row: the extracted pieces are handed to build_movie.
    _film_attrs = [Attribute(key=None,
                      multi=True,
                      path={
                          'link': "./b/a[1]/@href",
                          'title': "./b/a[1]/text()",
                          'notes': "./b/following-sibling::text()",
                          'year': "./span[@class='year_column']/text()",
                          'status': "./a[@class='in_production']/text()",
                          'rolesNoChar': './/br/following-sibling::text()',
                          'chrRoles': "./a[@imdbpyname]/@imdbpyname",
                          'roleID': "./a[starts-with(@href, '/character/')]/@href"
                          },
                      postprocess=lambda x:
                          build_movie(x.get('title') or u'',
                              year=x.get('year'),
                              movieID=analyze_imdbid(x.get('link') or u''),
                              rolesNoChar=(x.get('rolesNoChar') or u'').strip(),
                              chrRoles=(x.get('chrRoles') or u'').strip(),
                              additionalNotes=x.get('notes'),
                              roleID=(x.get('roleID') or u''),
                              status=x.get('status') or None))]

    extractors = [
            Extractor(label='name',
                        path="//h1[@class='header']",
                        attrs=Attribute(key='name',
                            path=".//text()",
                            postprocess=lambda x: analyze_name(x,
                                                               canonical=1))),

            Extractor(label='birth info',
                        path="//div[h4='Born:']",
                        attrs=_birth_attrs),

            Extractor(label='death info',
                        path="//div[h4='Died:']",
                        attrs=_death_attrs),

            Extractor(label='headshot',
                        path="//td[@id='img_primary']/a",
                        attrs=Attribute(key='headshot',
                            path="./img/@src")),

            # NOTE(review): the split separator is a single space as it
            # appears here; confirm the literal was not whitespace-collapsed.
            Extractor(label='akas',
                        path="//div[h4='Alternate Names:']",
                        attrs=Attribute(key='akas',
                            path="./text()",
                            postprocess=lambda x: x.strip().split(' '))),

            # Filmography sections are grouped by their "filmo-head-" header.
            Extractor(label='filmography',
                        group="//div[starts-with(@id, 'filmo-head-')]",
                        group_key="./a[@name]/text()",
                        group_key_normalize=lambda x: x.lower().replace(': ', ' '),
                        path="./following-sibling::div[1]" \
                                "/div[starts-with(@class, 'filmo-row')]",
                        attrs=_film_attrs),

            # NOTE(review): the path dict below only defines 'link' and
            # 'title'; if x holds just those keys, the 'roleID' and 'status'
            # lookups always fall back to their defaults -- confirm.
            Extractor(label='indevelopment',
                        path="//div[starts-with(@class,'devitem')]",
                        attrs=Attribute(key='in development',
                            multi=True,
                            path={
                                'link': './a/@href',
                                'title': './a/text()'
                                },
                                postprocess=lambda x:
                                    build_movie(x.get('title') or u'',
                                        movieID=analyze_imdbid(x.get('link') or u''),
                                        roleID=(x.get('roleID') or u'').split('/'),
                                        status=x.get('status') or None)))
            ]

    # The regex copies each character link's text into an "imdbpyname"
    # attribute (suffixed with "@@"), which _film_attrs reads as 'chrRoles'.
    preprocessors = [('<div class="clear"/> </div>', ''),
            ('<br/>', '<br />'),
            (re.compile(r'(<a href="/character/ch[0-9]{7}")>(.*?)</a>'),
                r'\1 imdbpyname="\2@@">\2</a>')]

    def postprocess_data(self, data):
        """Clean up the parsed dictionary and return it.

        Empty 'birth date'/'death date' entries are removed.  For backwards
        compatibility, every key starting with 'actor ', 'actress ' or
        'self ' is merged into the plain 'actor', 'actress' or 'self' list,
        and 'birth place'/'death place' are renamed to 'birth notes'/
        'death notes'.  *data* is modified in place.
        """
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        # XXX: the code below is for backwards compatibility
        # probably could be removed
        # Iterate over a snapshot of the keys: entries are added and removed
        # inside the loop.
        for key in list(data.keys()):
            if key == 'birth place':
                data['birth notes'] = data.pop(key)
            elif key == 'death place':
                data['death notes'] = data.pop(key)
            else:
                for prefix in ('actor', 'actress', 'self'):
                    if key.startswith(prefix + ' '):
                        data.setdefault(prefix, []).extend(data.pop(key))
                        break
        return data
class DOMHTMLBioParser(DOMParserBase):
    """Parser for the "biography" page of a given person.

    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bioparser = DOMHTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    _defGetRefs = True

    # Birth date (day and year links merged by build_date) and birth notes.
    _birth_attrs = [Attribute(key='birth date',
                        path={
                            'day': "./a[starts-with(@href, " \
                                    "'/date/')]/text()",
                            'year': "./a[starts-with(@href, " \
                                    "'/search/name?birth_year=')]/text()"
                            },
                        postprocess=build_date),
                    Attribute(key='birth notes',
                        path="./a[starts-with(@href, " \
                                "'/search/name?birth_place=')]/text()")]
    # NOTE(review): the year link is matched with "death_date=" here, while
    # the birth attributes use "birth_year=" -- confirm this is intended.
    _death_attrs = [Attribute(key='death date',
                        path={
                            'day': "./a[starts-with(@href, " \
                                    "'/date/')]/text()",
                            'year': "./a[starts-with(@href, " \
                                    "'/search/name?death_date=')]/text()"
                            },
                        postprocess=build_date),
                    Attribute(key='death notes',
                        path="./text()",
                        # TODO: check if this slicing is always correct
                        postprocess=lambda x: u''.join(x).strip()[2:])]
    extractors = [
            Extractor(label='headshot',
                        path="//a[@name='headshot']",
                        attrs=Attribute(key='headshot',
                            path="./img/@src")),
            Extractor(label='birth info',
                        path="//div[h5='Date of Birth']",
                        attrs=_birth_attrs),
            Extractor(label='death info',
                        path="//div[h5='Date of Death']",
                        attrs=_death_attrs),
            # Nick names are joined with '|' and split again; the first " ("
            # in each one is turned into "::(".
            Extractor(label='nick names',
                        path="//div[h5='Nickname']",
                        attrs=Attribute(key='nick names',
                            path="./text()",
                            joiner='|',
                            postprocess=lambda x: [n.strip().replace(' (',
                                    '::(', 1) for n in x.split('|')
                                    if n.strip()])),
            Extractor(label='birth name',
                        path="//div[h5='Birth Name']",
                        attrs=Attribute(key='birth name',
                            path="./text()",
                            postprocess=lambda x: canonicalName(x.strip()))),
            Extractor(label='height',
                        path="//div[h5='Height']",
                        attrs=Attribute(key='height',
                            path="./text()",
                            postprocess=lambda x: x.strip())),
            # Each biography becomes "text::author"; the author defaults to
            # "Anonymous" when missing.
            Extractor(label='mini biography',
                        path="//div[h5='Mini Biography']",
                        attrs=Attribute(key='mini biography',
                            multi=True,
                            path={
                                'bio': "./p//text()",
                                'by': "./b/following-sibling::a/text()"
                                },
                            postprocess=lambda x: "%s::%s" % \
                                (x.get('bio').strip(),
                                (x.get('by') or u'').strip() or u'Anonymous'))),
            # Each spouse becomes "name::info", with stray ':' stripped from
            # the ends.
            Extractor(label='spouse',
                        path="//div[h5='Spouse']/table/tr",
                        attrs=Attribute(key='spouse',
                            multi=True,
                            path={
                                'name': "./td[1]//text()",
                                'info': "./td[2]//text()"
                                },
                            postprocess=lambda x: ("%s::%s" % \
                                (x.get('name').strip(),
                                (x.get('info') or u'').strip())).strip(':'))),
            Extractor(label='trade mark',
                        path="//div[h5='Trade Mark']/p",
                        attrs=Attribute(key='trade mark',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='trivia',
                        path="//div[h5='Trivia']/p",
                        attrs=Attribute(key='trivia',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='quotes',
                        path="//div[h5='Personal Quotes']/p",
                        attrs=Attribute(key='quotes',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            # Each salary row becomes "title::info".
            Extractor(label='salary',
                        path="//div[h5='Salary']/table/tr",
                        attrs=Attribute(key='salary history',
                            multi=True,
                            path={
                                'title': "./td[1]//text()",
                                'info': "./td[2]/text()",
                                },
                            postprocess=lambda x: "%s::%s" % \
                                    (x.get('title').strip(),
                                        x.get('info').strip()))),
            Extractor(label='where now',
                        path="//div[h5='Where Are They Now']/p",
                        attrs=Attribute(key='where now',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            ]

    # Patterns are raw strings so that "\s" and "\." reach the regex engine
    # as written instead of being invalid string escapes; the compiled
    # patterns match exactly what the non-raw versions matched.
    preprocessors = [
        (re.compile(r'(<h5>)', re.I), r'</div><div class="_imdbpy">\1'),
        (re.compile(r'(</table>\n</div>\s+)</div>', re.I | re.DOTALL), r'\1'),
        (re.compile(r'(<div id="tn15bot">)'), r'</div>\1'),
        (re.compile(r'\.<br><br>([^\s])', re.I), r'. \1')
        ]

    def postprocess_data(self, data):
        """Remove empty 'birth date'/'death date' entries and return *data*."""
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        return data
class DOMHTMLOtherWorksParser(DOMParserBase):
    """Parser for the "other works" and "agent" pages of a given person.

    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        owparser = DOMHTMLOtherWorksParser()
        result = owparser.parse(otherworks_html_string)
    """
    _defGetRefs = True
    kind = 'other works'

    # XXX: looks like the 'agent' page is no more public.
    # NOTE(review): the attribute key is the literal string 'self.kind';
    # presumably the base parser resolves it to the `kind` attribute --
    # confirm in DOMParserBase.
    extractors = [
            Extractor(label='other works',
                        path="//h5[text()='Other works']/" \
                                "following-sibling::div[1]",
                        attrs=Attribute(key='self.kind',
                            path=".//text()",
                            postprocess=lambda x: x.strip().split('\n\n')))
            ]

    # Patterns are raw strings so that "\s" reaches the regex engine as
    # written instead of being an invalid string escape; the compiled
    # patterns match exactly what the non-raw versions matched.
    preprocessors = [
        (re.compile(r'(<h5>[^<]+</h5>)', re.I),
            r'</div>\1<div class="_imdbpy">'),
        (re.compile(r'(</table>\n</div>\s+)</div>', re.I), r'\1'),
        (re.compile(r'(<div id="tn15bot">)'), r'</div>\1'),
        (re.compile(r'<br/><br/>', re.I), r'\n\n')
        ]
def _build_episode(link, title, minfo, role, roleA, roleAID):
    """Build a Movie object for a given episode of a series.

    link    -- href of the episode; its imdbID becomes the movieID.
    title   -- episode title, parsed with analyze_title.
    minfo   -- text following the episode link; may start with a
               parenthesised air date and may contain a leftover role.
    role    -- role text, or None.
    roleA   -- role text taken from a link, used when role is None.
    roleAID -- href of the role link, or None; dropped when no role is found.

    Returns the Movie instance, with 'original air date' and 'year' set
    when they can be derived from minfo.
    """
    episode_id = analyze_imdbid(link)
    notes = u''
    minidx = minfo.find(' -')
    # Sometimes, for some unknown reason, the role is left in minfo.
    if minidx != -1:
        slfRole = minfo[minidx+3:].lstrip()
        minfo = minfo[:minidx].rstrip()
        if slfRole.endswith(')'):
            # A trailing "(...)" in the leftover role is kept as notes.
            commidx = slfRole.rfind('(')
            if commidx != -1:
                notes = slfRole[commidx:]
                slfRole = slfRole[:commidx]
        if slfRole and role is None and roleA is None:
            role = slfRole
    eps_data = analyze_title(title)
    eps_data['kind'] = u'episode'
    # FIXME: it's wrong for multiple characters (very rare on tv series?).
    if role is None:
        role = roleA # At worst, it's None.
    if role is None:
        roleAID = None
    if roleAID is not None:
        roleAID = analyze_imdbid(roleAID)
    e = Movie(movieID=episode_id, data=eps_data, currentRole=role,
                roleID=roleAID, notes=notes)
    # XXX: are we missing some notes?
    # XXX: does it parse things as "Episode dated 12 May 2005 (12 May 2005)"?
    if minfo.startswith('('):
        pe = minfo.find(')')
        if pe != -1:
            date = minfo[1:pe]
            if date != '????':
                e['original air date'] = date
                if eps_data.get('year', '????') == '????':
                    # An empty or blank date (e.g. "()") has no last word;
                    # guard against the IndexError it would otherwise raise.
                    date_words = date.split()
                    if date_words and date_words[-1].isdigit():
                        e['year'] = int(date_words[-1])
    return e
class DOMHTMLSeriesParser(DOMParserBase):
    """Parser for the "by TV series" page of a given person.

    The page is given as a string, as served by akas.imdb.com; the result
    is a dictionary with a key for every relevant section.

    Example:
        sparser = DOMHTMLSeriesParser()
        result = sparser.parse(filmoseries_html_string)
    """
    _containsObjects = True

    # Episodes are grouped by the series they belong to; every episode link
    # is converted into a Movie object by _build_episode.
    extractors = [
            Extractor(label='series',
                        group="//div[@class='filmo']/span[1]",
                        group_key="./a[1]",
                        path="./following-sibling::ol[1]/li/a[1]",
                        attrs=Attribute(key=None,
                            multi=True,
                            path={
                                'link': "./@href",
                                'title': "./text()",
                                'info': "./following-sibling::text()",
                                'role': "./following-sibling::i[1]/text()",
                                'roleA': "./following-sibling::a[1]/text()",
                                'roleAID': "./following-sibling::a[1]/@href"
                                },
                            postprocess=lambda x: _build_episode(x.get('link'),
                                x.get('title'),
                                (x.get('info') or u'').strip(),
                                x.get('role'),
                                x.get('roleA'),
                                x.get('roleAID'))))
            ]

    def postprocess_data(self, data):
        """Return {'episodes': {series: [episodes]}}, or {} for empty data.

        Every key of *data* is parsed again to obtain the link and title of
        the series; each of its episodes gets an 'episode of' entry that
        points to the resulting Movie object.
        """
        if not data:
            return {}
        by_series = {}
        for key, episodes in data.items():
            dom = self.get_dom(key)
            link = self.xpath(dom, "//a/@href")[0]
            # The first and last characters of the link text are dropped.
            title = self.xpath(dom, "//a/text()")[0][1:-1]
            series = Movie(movieID=analyze_imdbid(link),
                           data=analyze_title(title),
                           accessSystem=self._as, modFunct=self._modFunct)
            collected = []
            by_series[series] = collected
            for episode in episodes:
                # XXX: should we create a copy of 'series', to avoid
                #      circular references?
                episode['episode of'] = series
                collected.append(episode)
        return {'episodes': by_series}
class DOMHTMLPersonGenresParser(DOMParserBase):
    """Parser for the "by genre" and "by keywords" pages of a given person.

    The page is given as a string, as served by akas.imdb.com; the result
    is a dictionary with a key for every relevant section.

    Example:
        gparser = DOMHTMLPersonGenresParser()
        result = gparser.parse(bygenre_html_string)
    """
    kind = 'genres'
    _containsObjects = True

    # Movies are grouped by the lower-cased text of the group link; the title
    # handed to build_movie is the link text plus the trailing text up to the
    # first '['.
    extractors = [
            Extractor(label='genres',
                        group="//b/a[@name]/following-sibling::a[1]",
                        group_key="./text()",
                        group_key_normalize=lambda x: x.lower(),
                        path="../../following-sibling::ol[1]/li//a[1]",
                        attrs=Attribute(key=None,
                            multi=True,
                            path={
                                'link': "./@href",
                                'title': "./text()",
                                'info': "./following-sibling::text()"
                                },
                            postprocess=lambda x: \
                                    build_movie(x.get('title') + \
                                    x.get('info').split('[')[0],
                                    analyze_imdbid(x.get('link')))))
            ]

    def postprocess_data(self, data):
        """Wrap non-empty *data* under the `kind` key; return {} otherwise."""
        return {self.kind: data} if data else {}
from movieParser import _parse_merchandising_link
class DOMHTMLPersonSalesParser(DOMParserBase):
    """Parser for the "merchandising links" page of a given person.

    The page is given as a string, as served by akas.imdb.com; the result
    is a dictionary with a key for every relevant section.

    Example:
        sparser = DOMHTMLPersonSalesParser()
        result = sparser.parse(sales_html_string)
    """
    # Links are grouped by the text of their "merch_title" span; every row is
    # converted by _parse_merchandising_link.
    extractors = [
            Extractor(label='merchandising links',
                        group="//span[@class='merch_title']",
                        group_key=".//text()",
                        path="./following-sibling::table[1]/" \
                                "/td[@class='w_rowtable_colshop']//tr[1]",
                        attrs=Attribute(key=None,
                            multi=True,
                            path={
                                'link': "./td[2]/a[1]/@href",
                                'text': "./td[1]/img[1]/@alt",
                                'cover': "./ancestor::td[1]/../" \
                                        "td[1]/a[1]/img[1]/@src",
                                },
                            postprocess=_parse_merchandising_link)),
            ]

    # Rewrite self-closed named anchors as an explicit open/close pair.
    preprocessors = [
        (re.compile('(<a name="[^"]+" )/>', re.I), r'\1></a>')
        ]

    def postprocess_data(self, data):
        """Wrap non-empty *data* under 'merchandising links'; else return {}."""
        return {'merchandising links': data} if data else {}
from movieParser import DOMHTMLTechParser
from movieParser import DOMHTMLOfficialsitesParser
from movieParser import DOMHTMLAwardsParser
from movieParser import DOMHTMLNewsParser
# Parser registry exported by this module: each entry maps a parser name to a
# ((parser classes,), keyword-arguments-or-None) tuple.  Several entries reuse
# one class with a different 'kind' or 'subject' keyword argument.
# NOTE(review): presumably read by the package code that instantiates the
# parsers -- confirm against the loader.
_OBJECTS = {
    'maindetails_parser': ((DOMHTMLMaindetailsParser,), None),
    'bio_parser': ((DOMHTMLBioParser,), None),
    'otherworks_parser': ((DOMHTMLOtherWorksParser,), None),
    #'agent_parser': ((DOMHTMLOtherWorksParser,), {'kind': 'agent'}),
    'person_officialsites_parser': ((DOMHTMLOfficialsitesParser,), None),
    'person_awards_parser': ((DOMHTMLAwardsParser,), {'subject': 'name'}),
    'publicity_parser': ((DOMHTMLTechParser,), {'kind': 'publicity'}),
    'person_series_parser': ((DOMHTMLSeriesParser,), None),
    'person_contacts_parser': ((DOMHTMLTechParser,), {'kind': 'contacts'}),
    'person_genres_parser': ((DOMHTMLPersonGenresParser,), None),
    'person_keywords_parser': ((DOMHTMLPersonGenresParser,),
                                {'kind': 'keywords'}),
    'news_parser': ((DOMHTMLNewsParser,), None),
    'sales_parser': ((DOMHTMLPersonSalesParser,), None)
}
@@ -1,69 +0,0 @@
"""
parser.http.searchCharacterParser module (imdb package).
This module provides the HTMLSearchCharacterParser class (and the
search_character_parser instance), used to parse the results of a search
for a given character.
E.g., when searching for the name "Jesse James", the parsed page would be:
http://akas.imdb.com/find?s=Characters;mx=20;q=Jesse+James
Copyright 2007-2009 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
from imdb.utils import analyze_name, build_name
from utils import Extractor, Attribute, analyze_imdbid
from searchMovieParser import DOMHTMLSearchMovieParser, DOMBasicMovieParser
class DOMBasicCharacterParser(DOMBasicMovieParser):
    """Get just the name of a character and its imdbID.

    DOMHTMLSearchCharacterParser uses it to return a result for a direct
    match (when a search on IMDb yields a single character, the web server
    sends that character's page directly).
    """

    def _titleFunct(self, x):
        # Treat a missing name as an empty string.
        return analyze_name(x or u'', canonical=False)
class DOMHTMLSearchCharacterParser(DOMHTMLSearchMovieParser):
    """Parse the results page of a search for characters."""
    _BaseParser = DOMBasicCharacterParser
    _notDirectHitTitle = '<title>imdb search'
    _linkPrefix = '/character/ch'

    def _titleBuilder(self, x):
        return build_name(x, canonical=False)

    # Every result is turned into an (imdbID, {'name': ...}) tuple.
    _attrs = [Attribute(key='data',
                        multi=True,
                        path={
                            'link': "./a[1]/@href",
                            'name': "./a[1]/text()"
                            },
                        postprocess=lambda x: (
                            analyze_imdbid(x.get('link') or u''),
                            {'name': x.get('name')}
                        ))]
    extractors = [Extractor(label='search',
                            path="//td[3]/a[starts-with(@href, " \
                                    "'/character/ch')]/..",
                            attrs=_attrs)]
# Parser registry exported by this module: the parser name maps to a
# ((parser classes,), keyword-arguments) tuple.
# NOTE(review): presumably read by the package code that instantiates the
# parsers -- confirm against the loader.
_OBJECTS = {
        'search_character_parser': ((DOMHTMLSearchCharacterParser,),
                {'kind': 'character', '_basic_parser': DOMBasicCharacterParser})
}
@@ -1,71 +0,0 @@
"""
parser.http.searchCompanyParser module (imdb package).
This module provides the HTMLSearchCompanyParser class (and the
search_company_parser instance), used to parse the results of a search
for a given company.
E.g., when searching for the name "Columbia Pictures", the parsed page would be:
http://akas.imdb.com/find?s=co;mx=20;q=Columbia+Pictures
Copyright 2008-2009 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
from imdb.utils import analyze_company_name, build_company_name
from utils import Extractor, Attribute, analyze_imdbid
from searchMovieParser import DOMHTMLSearchMovieParser, DOMBasicMovieParser
class DOMBasicCompanyParser(DOMBasicMovieParser):
    """Get just the name of a company and its imdbID.

    DOMHTMLSearchCompanyParser uses it to return a result for a direct
    match (when a search on IMDb yields a single company, the web server
    sends that company's page directly).
    """

    def _titleFunct(self, x):
        # Treat a missing name as an empty string.
        return analyze_company_name(x or u'')
class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
    """Parse the results page of a search for companies."""
    _BaseParser = DOMBasicCompanyParser
    _notDirectHitTitle = '<title>imdb company'
    _linkPrefix = '/company/co'

    def _titleBuilder(self, x):
        return build_company_name(x)

    # Every result is turned into an (imdbID, company data) tuple; the notes
    # are appended to the name before it is analyzed.
    _attrs = [Attribute(key='data',
                        multi=True,
                        path={
                            'link': "./a[1]/@href",
                            'name': "./a[1]/text()",
                            'notes': "./text()[1]"
                            },
                        postprocess=lambda x: (
                            analyze_imdbid(x.get('link')),
                            analyze_company_name(x.get('name')+(x.get('notes')
                                                or u''), stripNotes=True)
                        ))]
    extractors = [Extractor(label='search',
                            path="//td[3]/a[starts-with(@href, " \
                                    "'/company/co')]/..",
                            attrs=_attrs)]
# Parser registry exported by this module: the parser name maps to a
# ((parser classes,), keyword-arguments) tuple.
# NOTE(review): presumably read by the package code that instantiates the
# parsers -- confirm against the loader.
_OBJECTS = {
    'search_company_parser': ((DOMHTMLSearchCompanyParser,),
                    {'kind': 'company', '_basic_parser': DOMBasicCompanyParser})
}
@@ -1,111 +0,0 @@
"""
parser.http.searchKeywordParser module (imdb package).
This module provides the HTMLSearchKeywordParser class (and the
search_keyword_parser instance), used to parse the results of a search
for a given keyword.
E.g., when searching for the keyword "alabama", the parsed page would be:
http://akas.imdb.com/find?s=kw;mx=20;q=alabama
Copyright 2009 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
from utils import Extractor, Attribute, analyze_imdbid
from imdb.utils import analyze_title, analyze_company_name
from searchMovieParser import DOMHTMLSearchMovieParser, DOMBasicMovieParser
class DOMBasicKeywordParser(DOMBasicMovieParser):
    """Get just the name of a keyword.

    DOMHTMLSearchKeywordParser uses it to return a result for a direct
    match (when a search on IMDb yields a single keyword, the web server
    sends that keyword's page directly).
    """
    # XXX: it's still to be tested!
    # I'm not even sure there can be a direct hit, searching for keywords.

    def _titleFunct(self, x):
        # Treat a missing name as an empty string.
        return analyze_company_name(x or u'')
class DOMHTMLSearchKeywordParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, searching for keywords similar to
    the one given."""

    _BaseParser = DOMBasicKeywordParser
    _notDirectHitTitle = '<title>imdb keyword'
    _linkPrefix = '/keyword/'

    def _titleBuilder(self, x):
        # Keywords are used as they are.
        return x

    # Every result is just the text of the keyword link.
    _attrs = [Attribute(key='data',
                        multi=True,
                        path="./a[1]/text()"
                        )]
    extractors = [Extractor(label='search',
                            path="//td[3]/a[starts-with(@href, " \
                                    "'/keyword/')]/..",
                            attrs=_attrs)]
def custom_analyze_title4kwd(title, yearNote, outline):
    """Return the analyzed title, as a dictionary with the needed info.

    An empty or blank *title* gives {}.  When *yearNote* is set, its part
    before the first ' ' is appended to the title, followed by ')', before
    the title is analyzed.  A non-empty *outline* is stored as
    'plot outline'.
    """
    full_title = title.strip()
    if not full_title:
        return {}
    if yearNote:
        full_title = '%s %s)' % (full_title, yearNote.split(' ')[0])
    info = analyze_title(full_title)
    if outline:
        info['plot outline'] = outline
    return info
class DOMHTMLSearchMovieKeywordParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, searching for movies with the given
    keyword."""

    _notDirectHitTitle = '<title>best'

    # Every result is turned into an (imdbID, title data) tuple; the title
    # data is built by custom_analyze_title4kwd from the link text, the
    # "desc" span and the "outline" span, each defaulting to u''.
    _attrs = [Attribute(key='data',
                        multi=True,
                        path={
                            'link': "./a[1]/@href",
                            'info': "./a[1]//text()",
                            'ynote': "./span[@class='desc']/text()",
                            'outline': "./span[@class='outline']//text()"
                            },
                        postprocess=lambda x: (
                            analyze_imdbid(x.get('link') or u''),
                            custom_analyze_title4kwd(x.get('info') or u'',
                                                    x.get('ynote') or u'',
                                                    x.get('outline') or u'')
                        ))]

    # Select the parent of every title link found in a third table cell.
    extractors = [Extractor(label='search',
                            path="//td[3]/a[starts-with(@href, " \
                                    "'/title/tt')]/..",
                            attrs=_attrs)]
# Parser registry exported by this module: each entry maps a parser name to a
# ((parser classes,), keyword-arguments-or-None) tuple.
# NOTE(review): presumably read by the package code that instantiates the
# parsers -- confirm against the loader.
_OBJECTS = {
    'search_keyword_parser': ((DOMHTMLSearchKeywordParser,),
                    {'kind': 'keyword', '_basic_parser': DOMBasicKeywordParser}),
    'search_moviekeyword_parser': ((DOMHTMLSearchMovieKeywordParser,), None)
}
-178
View File
@@ -1,178 +0,0 @@
"""
parser.http.searchMovieParser module (imdb package).
This module provides the HTMLSearchMovieParser class (and the
search_movie_parser instance), used to parse the results of a search
for a given title.
E.g., when searching for the title "the passion", the parsed
page would be:
http://akas.imdb.com/find?q=the+passion&tt=on&mx=20
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import re
from imdb.utils import analyze_title, build_title
from utils import DOMParserBase, Attribute, Extractor, analyze_imdbid
class DOMBasicMovieParser(DOMParserBase):
    """Simply get the title of a movie and the imdbID.

    It's used by the DOMHTMLSearchMovieParser class to return a result
    for a direct match (when a search on IMDb results in a single
    movie, the web server sends directly the movie page).
    """
    # Stay generic enough to be used also for other DOMBasic*Parser classes.
    _titleAttrPath = ".//text()"
    _linkPath = "//link[@rel='canonical']"

    def _titleFunct(self, x):
        """Analyze the extracted title; a missing title counts as empty."""
        return analyze_title(x or u'')

    def _init(self):
        """Set up the instance's preprocessors and extractors."""
        # Build a new list rather than using "+=": the augmented assignment
        # would extend the class-level list in place, so it would grow (and
        # be shared) every time an instance is created.
        self.preprocessors = self.preprocessors + [
                ('<span class="tv-extra">TV mini-series</span>',
                 '<span class="tv-extra">(mini)</span>')]
        self.extractors = [Extractor(label='title',
                                path="//h1",
                                attrs=Attribute(key='title',
                                            path=self._titleAttrPath,
                                            postprocess=self._titleFunct)),
                            Extractor(label='link',
                                path=self._linkPath,
                                attrs=Attribute(key='link', path="./@href",
                                postprocess=lambda x: \
                                        analyze_imdbid((x or u'').replace(
                                            'http://pro.imdb.com', ''))
                                    ))]

    # Remove 'More at IMDb Pro' links.
    preprocessors = [(re.compile(r'<span class="pro-link".*?</span>'), ''),
            (re.compile(r'<a href="http://ad.doubleclick.net.*?;id=(co[0-9]{7});'), r'<a href="http://pro.imdb.com/company/\1"></a>< a href="')]

    def postprocess_data(self, data):
        """Return [(link, data)] without the 'link' key, or [] if incomplete.

        The result is empty when there is no 'link' entry, when the link is
        empty, or when nothing else is left in *data* once the link has
        been removed.
        """
        if 'link' not in data:
            return []
        link = data.pop('link')
        if link and data:
            return [(link, data)]
        return []
def custom_analyze_title(title):
    """Remove garbage notes after the (year), (year/imdbIndex) or (year) (TV)

    Only the text before the first ' ' separator is analyzed; when that
    part is empty the whole title is used, and an empty title gives {}.
    """
    # XXX: very crappy. :-(
    # NOTE(review): the separator is a single space as it appears here, which
    # keeps only the first word; confirm the literal was not
    # whitespace-collapsed.
    head = title.split(' ', 1)[0]
    candidate = head or title
    if not candidate:
        return {}
    return analyze_title(candidate)
# Manage AKAs.
# Matches 'aka <em>"' followed by the shortest text up to a "<br>" or "</td>"
# (group 1 is that text, group 2 the terminator).  Within this module it is
# only referenced from commented-out code in
# DOMHTMLSearchMovieParser.preprocess_string.
_reAKAStitles = re.compile(r'(?:aka) <em>"(.*?)(<br>|<\/td>)', re.I | re.M)
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""

    _BaseParser = DOMBasicMovieParser
    _notDirectHitTitle = '<title>imdb title'
    _linkPrefix = '/title/tt'

    def _titleBuilder(self, x):
        """Rebuild the title string from the data of a direct hit."""
        return build_title(x)

    # Every result is turned into an (imdbID, title data, akas) tuple; the
    # akas are folded into the title data by postprocess_data.
    _attrs = [Attribute(key='data',
                        multi=True,
                        path={
                            'link': "./a[1]/@href",
                            'info': ".//text()",
                            #'akas': ".//div[@class='_imdbpyAKA']//text()"
                            'akas': ".//p[@class='find-aka']//text()"
                            },
                        postprocess=lambda x: (
                            analyze_imdbid(x.get('link') or u''),
                            custom_analyze_title(x.get('info') or u''),
                            x.get('akas')
                        ))]
    extractors = [Extractor(label='search',
                        path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
                        attrs=_attrs)]

    def _init(self):
        self.url = u''

    def _reset(self):
        self.url = u''

    def preprocess_string(self, html_string):
        """Return the html to parse, faking a result row for a direct hit.

        When the start of the page contains the "not a direct hit" title,
        the page is a list of results and is returned (after some
        replacements, for movies only).  Otherwise the page is parsed with
        the basic parser and a one-row html fragment is built from the link
        and title it finds; u'' is returned when they cannot be determined.
        """
        if self._notDirectHitTitle in html_string[:1024].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                html_string = html_string.replace('(TV mini-series)', '(mini)')
                # Prefix every AKA with '::' so postprocess_data can split
                # them.
                html_string = html_string.replace('<p class="find-aka">',
                        '<p class="find-aka">::')
                #html_string = _reAKAStitles.sub(
                #        r'<div class="_imdbpyAKA">\1::</div>\2', html_string)
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res: return u''
        res = res['data']
        if not (res and res[0]): return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        #    # Tries to cope with companies for which links to pro.imdb.com
        #    # are missing.
        #    link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title): return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link,
                                                                    title)
        return new_html

    def postprocess_data(self, data):
        """Normalize the list of results stored under data['data'].

        The list is created when missing and truncated to `self.results`
        entries when that attribute is set.  Three-item result tuples are
        reduced to (imdbID, info), with the non-empty AKAs stored under
        info['akas'].  *data* is modified in place and returned.
        """
        if 'data' not in data:
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        # Horrible hack to support AKAs.
        if data and data['data'] and len(data['data'][0]) == 3 and \
                isinstance(data['data'][0], tuple):
            for idx, datum in enumerate(data['data']):
                if not isinstance(datum, tuple):
                    continue
                if datum[2] is not None:
                    # A real list, whatever the filter() flavour would be.
                    akas = [a for a in datum[2].split('::') if a]
                    if self._linkPrefix == '/title/tt':
                        akas = [a.replace('" - ', '::').rstrip() for a in akas]
                        akas = [a.replace('aka "', '', 1).lstrip()
                                for a in akas]
                    datum[1]['akas'] = akas
                # With or without AKAs, drop the third item of the tuple.
                data['data'][idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        """Return *data* unchanged."""
        return data
# Parser registry exported by this module: the parser name maps to a
# ((parser classes,), keyword-arguments-or-None) tuple.
# NOTE(review): presumably read by the package code that instantiates the
# parsers -- confirm against the loader.
_OBJECTS = {
        'search_movie_parser': ((DOMHTMLSearchMovieParser,), None)
}
@@ -1,92 +0,0 @@
"""
parser.http.searchPersonParser module (imdb package).
This module provides the HTMLSearchPersonParser class (and the
search_person_parser instance), used to parse the results of a search
for a given person.
E.g., when searching for the name "Mel Gibson", the parsed page would be:
http://akas.imdb.com/find?q=Mel+Gibson&nm=on&mx=20
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import re
from imdb.utils import analyze_name, build_name
from utils import Extractor, Attribute, analyze_imdbid
from searchMovieParser import DOMHTMLSearchMovieParser, DOMBasicMovieParser
def _cleanName(n):
"""Clean the name in a title tag."""
if not n:
return u''
n = n.replace('Filmography by type for', '') # FIXME: temporary.
return n
class DOMBasicPersonParser(DOMBasicMovieParser):
    """Get just the name of a person and its imdbID.

    DOMHTMLSearchPersonParser uses it to return a result for a direct
    match (when a search on IMDb yields a single person, the web server
    sends that person's page directly).
    """

    def _titleFunct(self, x):
        # The name is cleaned by _cleanName before being analyzed.
        return analyze_name(_cleanName(x), canonical=1)
# Matches 'aka' or 'birth name' followed by a quoted name: group 1 is the
# opening '<em>"', group 2 the name and group 3 the terminating tag.
# DOMHTMLSearchPersonParser.preprocess_string uses it to wrap the name in a
# "_imdbpyAKA" div.
_reAKASp = re.compile(r'(?:aka|birth name) (<em>")(.*?)"(<br>|<\/em>|<\/td>)',
                    re.I | re.M)
class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for persons."""

    _BaseParser = DOMBasicPersonParser
    _notDirectHitTitle = '<title>imdb name'
    _linkPrefix = '/name/nm'

    def _titleBuilder(self, x):
        return build_name(x, canonical=True)

    # Every result is turned into an (imdbID, name data, akas) tuple; the
    # text that follows the link is appended to the name before analysis.
    _attrs = [Attribute(key='data',
                        multi=True,
                        path={
                            'link': "./a[1]/@href",
                            'name': "./a[1]/text()",
                            'index': "./text()[1]",
                            'akas': ".//div[@class='_imdbpyAKA']/text()"
                            },
                        postprocess=lambda x: (
                            analyze_imdbid(x.get('link') or u''),
                            analyze_name((x.get('name') or u'') + \
                                        (x.get('index') or u''),
                                         canonical=1), x.get('akas')
                        ))]
    extractors = [Extractor(label='search',
                        path="//td[3]/a[starts-with(@href, '/name/nm')]/..",
                        attrs=_attrs)]

    def preprocess_string(self, html_string):
        """Mark up the AKAs of a results page, then defer to the base class."""
        page_head = html_string[:1024].lower()
        if self._notDirectHitTitle in page_head:
            # A list of results: wrap every AKA in its own div, ending it
            # with '::'.
            aka_markup = r'\1<div class="_imdbpyAKA">\2::</div>\3'
            html_string = _reAKASp.sub(aka_markup, html_string)
        return DOMHTMLSearchMovieParser.preprocess_string(self, html_string)
_OBJECTS = {
'search_person_parser': ((DOMHTMLSearchPersonParser,),
{'kind': 'person', '_basic_parser': DOMBasicPersonParser})
}
-106
View File
@@ -1,106 +0,0 @@
"""
parser.http.topBottomParser module (imdb package).
This module provides the classes (and the instances), used to parse the
lists of top 250 and bottom 100 movies.
E.g.:
http://akas.imdb.com/chart/top
http://akas.imdb.com/chart/bottom
Copyright 2009 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
from imdb.utils import analyze_title
from utils import DOMParserBase, Attribute, Extractor, analyze_imdbid
class DOMHTMLTop250Parser(DOMParserBase):
    """Parser for the "top 250" page.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The rows of the chart are converted by
    postprocess_data() to a list of (movieID, info) tuples.

    Example:
        tparser = DOMHTMLTop250Parser()
        result = tparser.parse(top250_html_string)
    """
    # Key under which the extracted rows are stored.
    label = 'top 250'
    # Key used, in the information about a movie, for its rank.
    ranktext = 'top 250 rank'

    def _init(self):
        """Build the extractor; it's done here and not at class level
        because the keys depend on self.label and self.ranktext, which
        subclasses override."""
        self.extractors = [Extractor(label=self.label,
                        path="//div[@id='main']//table//tr",
                        attrs=Attribute(key=None,
                                multi=True,
                                path={self.ranktext: "./td[1]//text()",
                                        'rating': "./td[2]//text()",
                                        'title': "./td[3]//text()",
                                        'movieID': "./td[3]//a/@href",
                                        'votes': "./td[4]//text()"
                                        }))]

    def postprocess_data(self, data):
        """Return a list of (movieID, info) tuples, one for every row
        with a recognizable movieID; info is the value returned by
        analyze_title() for the title, plus the rank, the 'votes' and
        the 'rating' whenever they can be converted to numbers.
        An empty list is returned if the chart was not found."""
        if not data or self.label not in data:
            return []
        mlist = []
        # Values that are missing (None) or not numeric are just skipped;
        # any other kind of error is no longer silently swallowed.
        unparsable = (AttributeError, TypeError, ValueError)
        # Avoid duplicates.  A real fix, using XPath, would be desirable.
        # XXX: probably this is no longer needed.
        seenIDs = set()
        for d in data[self.label]:
            if 'movieID' not in d: continue
            if self.ranktext not in d: continue
            if 'title' not in d: continue
            theID = analyze_imdbid(d['movieID'])
            if theID is None:
                # E.g.: a row without a link to a movie.
                continue
            theID = str(theID)
            if theID in seenIDs:
                continue
            seenIDs.add(theID)
            minfo = analyze_title(d['title'])
            try:
                minfo[self.ranktext] = int(d[self.ranktext].replace('.', ''))
            except unparsable:
                pass
            if 'votes' in d:
                try:
                    minfo['votes'] = int(d['votes'].replace(',', ''))
                except unparsable:
                    pass
            if 'rating' in d:
                try:
                    minfo['rating'] = float(d['rating'])
                except unparsable:
                    pass
            mlist.append((theID, minfo))
        return mlist
class DOMHTMLBottom100Parser(DOMHTMLTop250Parser):
    """Parser for the "bottom 100" page.

    It behaves exactly like DOMHTMLTop250Parser, from which everything
    is inherited; only the keys used for the list and for the rank of
    a movie are different.  The page should be provided as a string,
    as taken from the akas.imdb.com server.

    Example:
        tparser = DOMHTMLBottom100Parser()
        result = tparser.parse(bottom100_html_string)
    """
    ranktext = 'bottom 100 rank'
    label = 'bottom 100'
# Registry of the parsers defined in this module: each key is a parser
# name, each value a pair made of a tuple of parser classes and the
# options for them (None: no options).
# NOTE(review): presumably read by the package code that instantiates the
# parsers -- confirm against imdb.parser.http.
_OBJECTS = {
'top250_parser': ((DOMHTMLTop250Parser,), None),
'bottom100_parser': ((DOMHTMLBottom100Parser,), None)
}
-855
View File
@@ -1,855 +0,0 @@
"""
parser.http.utils module (imdb package).
This module provides miscellaneous utilities used by
the imdb.parser.http classes.
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import re
import logging
from imdb._exceptions import IMDbError
from imdb.utils import flatten, _Container
from imdb.Movie import Movie
from imdb.Person import Person
from imdb.Character import Character
# Year, imdbIndex and kind.
# A single group is captured: four characters (digits and/or '?') in
# parentheses, optionally followed - inside the same parentheses - by a
# slash and some roman numerals; after that, one of ' (mini)', ' (TV)',
# ' (V)' or ' (VG)' may follow.
re_yearKind_index = re.compile(r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)')
# Match imdb ids in href tags: the first group is the kind of the
# resource (title, name, character or company), the second its digits.
re_imdbid = re.compile(r'(title/tt|name/nm|character/ch|company/co)([0-9]+)')

def analyze_imdbid(href):
    """Return the imdbID (the digits only, as a string) found in the
    given URL; None is returned for an empty/missing URL or when no
    imdbID can be found in it."""
    found = None
    if href:
        found = re_imdbid.search(href)
    if found:
        return str(found.group(2))
    return None
_modify_keys = list(Movie.keys_tomodify_list) + list(Person.keys_tomodify_list)
def _putRefs(d, re_titles, re_names, re_characters, lastKey=None):
"""Iterate over the strings inside list items or dictionary values,
substitutes movie titles and person names with the (qv) references."""
if isinstance(d, list):
for i in xrange(len(d)):
if isinstance(d[i], (unicode, str)):
if lastKey in _modify_keys:
if re_names:
d[i] = re_names.sub(ur"'\1' (qv)", d[i])
if re_titles:
d[i] = re_titles.sub(ur'_\1_ (qv)', d[i])
if re_characters:
d[i] = re_characters.sub(ur'#\1# (qv)', d[i])
elif isinstance(d[i], (list, dict)):
_putRefs(d[i], re_titles, re_names, re_characters,
lastKey=lastKey)
elif isinstance(d, dict):
for k, v in d.items():
lastKey = k
if isinstance(v, (unicode, str)):
if lastKey in _modify_keys:
if re_names:
d[k] = re_names.sub(ur"'\1' (qv)", v)
if re_titles:
d[k] = re_titles.sub(ur'_\1_ (qv)', v)
if re_characters:
d[k] = re_characters.sub(ur'#\1# (qv)', v)
elif isinstance(v, (list, dict)):
_putRefs(d[k], re_titles, re_names, re_characters,
lastKey=lastKey)
# Handle HTML/XML/SGML entities.
from htmlentitydefs import entitydefs
entitydefs = entitydefs.copy()
entitydefsget = entitydefs.get
entitydefs['nbsp'] = ' '
sgmlentity = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
sgmlentityget = sgmlentity.get
_sgmlentkeys = sgmlentity.keys()
entcharrefs = {}
entcharrefsget = entcharrefs.get
for _k, _v in entitydefs.items():
if _k in _sgmlentkeys: continue
if _v[0:2] == '&#':
dec_code = _v[1:-1]
_v = unichr(int(_v[2:-1]))
entcharrefs[dec_code] = _v
else:
dec_code = '#' + str(ord(_v))
_v = unicode(_v, 'latin_1', 'replace')
entcharrefs[dec_code] = _v
entcharrefs[_k] = _v
del _sgmlentkeys, _k, _v
# Non-breaking spaces and double quotes, in their numeric forms; the
# hexadecimal ones are listed with both a lower-case and an upper-case 'x',
# since lookups in this dictionary are case-sensitive.
entcharrefs['#160'] = u' '
entcharrefs['#xA0'] = u' '
entcharrefs['#xa0'] = u' '
entcharrefs['#XA0'] = u' '
entcharrefs['#x22'] = u'"'
entcharrefs['#X22'] = u'"'
# convert &x26; to &amp;, to make BeautifulSoup happy; beware that this
# leaves lone '&' in the html broken, but I assume this is better than
# the contrary...
entcharrefs['#38'] = u'&amp;'
entcharrefs['#x26'] = u'&amp;'
# The upper-case form was missing ('#x26' was assigned twice): without
# it, '&#X26;' is converted to a bare '&' by _replXMLRef.
entcharrefs['#X26'] = u'&amp;'
# Matches (ignoring the case) '&...;' where '...' is one of the keys of
# entcharrefs, a decimal reference of up to 5 digits or a hexadecimal
# reference of up to 4 digits; the text between '&' and ';' is captured.
re_entcharrefs = re.compile('&(%s|\#160|\#\d{1,5}|\#x[0-9a-f]{1,4});' %
'|'.join(map(re.escape, entcharrefs)), re.I)
re_entcharrefssub = re_entcharrefs.sub
# Add to sgmlentity the decimal references of the same five characters.
sgmlentity.update(dict([('#34', u'"'), ('#38', u'&'),
('#60', u'<'), ('#62', u'>'), ('#39', u"'")]))
# Matches (case-sensitively) '&...;' where '...' is a key of sgmlentity.
re_sgmlref = re.compile('&(%s);' % '|'.join(map(re.escape, sgmlentity)))
re_sgmlrefsub = re_sgmlref.sub
# Matches XML-only single tags, like <br/> ; they are invalid in HTML,
# but widely used by IMDb web site. :-/
# Only tags without attributes and without a space before the slash
# are matched; the name of the tag is captured.
re_xmltags = re.compile('<([a-zA-Z]+)/>')
def _replXMLRef(match):
    """Replace the matched XML/HTML entities and references;
    replace everything except sgml entities like &lt;, &gt;, ...

    match is a match object of re_entcharrefs; the replacement string
    is returned.  References that can't be converted to a character
    are returned unchanged.
    """
    ref = match.group(1)
    value = entcharrefsget(ref)
    if value is not None:
        return value
    if ref[0] != '#':
        # An entity name matched only thanks to the case-insensitive
        # regular expression: return the bare name, as always done.
        return ref
    ref_code = ref[1:]
    if ref_code in ('34', '38', '60', '62', '39'):
        # Decimal references to ", &, <, > and ': leave them alone.
        return match.group(0)
    try:
        if ref_code[0].lower() == 'x':
            #if ref[2:] == '26':
            #    # Don't convert &x26; to &amp;, to make BeautifulSoup happy.
            #    return '&amp;'
            return unichr(int(ref_code[1:], 16))
        return unichr(int(ref_code))
    except (ValueError, OverflowError):
        # Decimal references can be as large as 99999, but on "narrow"
        # builds of Python unichr() only accepts values up to 0xFFFF:
        # keep the reference as it is, instead of aborting the parsing.
        return match.group(0)
def subXMLRefs(s):
    """Return a copy of the html string s where every entity and char
    reference matched by re_entcharrefs is replaced with the value
    returned by _replXMLRef."""
    converted = re_entcharrefssub(_replXMLRef, s)
    return converted
# XXX: no longer used here; move it to mobile (they are imported by
#      helpers, too)?
def _replSGMLRefs(match):
    """Return the replacement for the matched SGML entity: its value in
    sgmlentity or, for unknown entities, the bare name/code."""
    entity = match.group(1)
    return sgmlentityget(entity, entity)
def subSGMLRefs(s):
    """Return a copy of the html string s where every sgml entity and
    char reference matched by re_sgmlref is replaced with the value
    returned by _replSGMLRefs."""
    converted = re_sgmlrefsub(_replSGMLRefs, s)
    return converted
_b_p_logger = logging.getLogger('imdbpy.parser.http.build_person')

def build_person(txt, personID=None, billingPos=None,
                roleID=None, accessSystem='http', modFunct=None):
    """Return a Person instance from the typical <tr>...</tr> strings
    found in the IMDb's web site.

    txt is the text to analyze: a name, optionally followed by a
    separator ('....' or '...') and by the role and/or some notes.
    roleID can be None (what follows the name is considered notes),
    a single ID or a list of IDs (the roles are separated by '/');
    a list given by the caller is never modified.  The other arguments
    are passed to the Person constructor (the IDs, converted to strings).
    """
    #if personID is None
    #    _b_p_logger.debug('empty name or personID for "%s"', txt)
    notes = u''
    role = u''
    # Search the (optional) separator between name and role/notes.
    if txt.find('....') != -1:
        sep = '....'
    else:
        sep = '...'
        if txt.find('...') == -1:
            # Replace the first parenthesis, assuming there are only
            # notes, after.
            # Rationale: no imdbIndex is (ever?) shown on the web site.
            txt = txt.replace('(', '...(', 1)
    txt_split = txt.split(sep, 1)
    name = txt_split[0].strip()
    if len(txt_split) == 2:
        role_comment = txt_split[1].strip()
        # Strip common endings.
        if role_comment[-4:] == ' and':
            role_comment = role_comment[:-4].rstrip()
        elif role_comment[-2:] == ' &':
            role_comment = role_comment[:-2].rstrip()
        elif role_comment[-6:] == '& ....':
            role_comment = role_comment[:-6].rstrip()
        # Get the notes.
        if roleID is None:
            # We're managing something that doesn't have a 'role', so
            # everything is notes.
            notes = role_comment
        elif isinstance(roleID, list):
            # Multiple roles: they are split (with their notes) below.
            role = role_comment
        else:
            cmt_idx = role_comment.find('(')
            if cmt_idx != -1:
                role = role_comment[:cmt_idx].rstrip()
                notes = role_comment[cmt_idx:]
            else:
                # Just a role, without notes.
                role = role_comment
    if role == '....': role = u''
    roleNotes = []
    # Manages multiple roleIDs.
    if isinstance(roleID, list):
        # Work on a copy: the list of the caller must not be modified.
        roleID = list(roleID)
        rolesplit = role.split('/')
        role = []
        for r in rolesplit:
            nidx = r.find('(')
            if nidx != -1:
                role.append(r[:nidx].rstrip())
                roleNotes.append(r[nidx:])
            else:
                role.append(r)
                roleNotes.append(None)
        lr = len(role)
        lrid = len(roleID)
        if lr > lrid:
            # More roles than IDs: pad the IDs (the old code multiplied
            # by a negative number, adding nothing).
            roleID += [None] * (lr - lrid)
        elif lr < lrid:
            roleID = roleID[:lr]
        for i, rid in enumerate(roleID):
            if rid is not None:
                roleID[i] = str(rid)
        if lr == 1:
            role = role[0]
            roleID = roleID[0]
    elif roleID is not None:
        roleID = str(roleID)
    if personID is not None:
        personID = str(personID)
    if (not name) or (personID is None):
        # Set to 'debug', since build_person is expected to receive some crap.
        _b_p_logger.debug('empty name or personID for "%s"', txt)
    # XXX: return None if something strange is detected?
    person = Person(name=name, personID=personID, currentRole=role,
                    roleID=roleID, notes=notes, billingPos=billingPos,
                    modFunct=modFunct, accessSystem=accessSystem)
    # roleID is still a list only if there are multiple roles (it may be
    # a string or None, here, and len() must not be called on None).
    if roleNotes and isinstance(roleID, list) and \
            len(roleNotes) == len(roleID):
        for idx, curRole in enumerate(person.currentRole):
            try:
                if roleNotes[idx]:
                    curRole.notes = roleNotes[idx]
            except IndexError:
                # The same safety net used by build_movie.
                break
    return person
# Sequences of exactly seven digits; used by build_movie to extract the
# IDs from a string of roleIDs.
_re_chrIDs = re.compile('[0-9]{7}')
_b_m_logger = logging.getLogger('imdbpy.parser.http.build_movie')
# To shrink spaces.
re_spaces = re.compile(r'\s+')
def build_movie(txt, movieID=None, roleID=None, status=None,
                accessSystem='http', modFunct=None, _parsingCharacter=False,
                _parsingCompany=False, year=None, chrRoles=None,
                rolesNoChar=None, additionalNotes=None):
    """Given a string as normally seen on the "categorized" page of
    a person on the IMDb's web site, returns a Movie instance.

    txt is the text to analyze: a title, optionally followed by a
    separator (' Played by ' when _parsingCharacter is true, ' ... '
    when _parsingCompany is true, ' .... ' otherwise) and by the role.
    If year is given it's appended, in parentheses, to the title.
    roleID can be a single ID or a list of IDs (a list given by the
    caller is never modified); chrRoles is a '@@'-separated string of
    roles, used when no role is found in txt; rolesNoChar is
    a '/'-separated string of roles, added to the other ones;
    additionalNotes is appended to the notes; status, if not empty, is
    stored under the 'status' key of the movie.
    """
    # FIXME: Oook, lets face it: build_movie and build_person are now
    # two horrible sets of patches to support the new IMDb design.  They
    # must be rewritten from scratch.
    if _parsingCharacter:
        _defSep = ' Played by '
    elif _parsingCompany:
        _defSep = ' ... '
    else:
        _defSep = ' .... '
    title = re_spaces.sub(' ', txt).strip()
    # Split the role/notes from the movie title.
    tsplit = title.split(_defSep, 1)
    role = u''
    notes = u''
    roleNotes = []
    if len(tsplit) == 2:
        title = tsplit[0].rstrip()
        role = tsplit[1].lstrip()
    if title[-9:] == 'TV Series':
        title = title[:-9].rstrip()
    elif title[-14:] == 'TV mini-series':
        title = title[:-14] + ' (mini)'
    # Try to understand where the movie title ends.
    while True:
        if year:
            break
        if title[-1:] != ')':
            # Ignore the silly "TV Series" notice.
            if title[-9:] == 'TV Series':
                title = title[:-9].rstrip()
                continue
            else:
                # Just a title: stop here.
                break
        # Try to match paired parentheses; yes: sometimes there are
        # parentheses inside comments...
        nidx = title.rfind('(')
        while (nidx != -1 and
                title[nidx:].count('(') != title[nidx:].count(')')):
            nidx = title[:nidx].rfind('(')
        # Unbalanced parentheses: stop here.
        if nidx == -1: break
        # The last item in parentheses seems to be a year: stop here.
        first4 = title[nidx+1:nidx+5]
        if (first4.isdigit() or first4 == '????') and \
                title[nidx+5:nidx+6] in (')', '/'): break
        # The last item in parentheses is a known kind: stop here.
        if title[nidx+1:-1] in ('TV', 'V', 'mini', 'VG'): break
        # Else, in parentheses there are some notes.
        # XXX: should the notes in the role half be kept separated
        #      from the notes in the movie title half?
        if notes: notes = '%s %s' % (title[nidx:], notes)
        else: notes = title[nidx:]
        title = title[:nidx].rstrip()
    if year:
        year = year.strip()
        # A slice, and not title[-1], so that an empty title doesn't
        # raise an IndexError.
        if title[-1:] == ')':
            fpIdx = title.rfind('(')
            if fpIdx != -1:
                if notes: notes = '%s %s' % (title[fpIdx:], notes)
                else: notes = title[fpIdx:]
                title = title[:fpIdx].rstrip()
        title = u'%s (%s)' % (title, year)
    if _parsingCharacter and roleID and not role:
        roleID = None
    if not roleID:
        roleID = None
    elif len(roleID) == 1:
        roleID = roleID[0]
    if not role and chrRoles and isinstance(roleID, (str, unicode)):
        roleID = _re_chrIDs.findall(roleID)
        role = ' / '.join(filter(None, chrRoles.split('@@')))
    # Manages multiple roleIDs.
    if isinstance(roleID, list):
        # Work on a copy: the list of the caller must not be modified.
        roleID = list(roleID)
        tmprole = role.split('/')
        role = []
        for r in tmprole:
            nidx = r.find('(')
            if nidx != -1:
                role.append(r[:nidx].rstrip())
                roleNotes.append(r[nidx:])
            else:
                role.append(r)
                roleNotes.append(None)
        lr = len(role)
        lrid = len(roleID)
        if lr > lrid:
            # More roles than IDs: pad the IDs (the old code multiplied
            # by a negative number, adding nothing).
            roleID += [None] * (lr - lrid)
        elif lr < lrid:
            roleID = roleID[:lr]
        for i, rid in enumerate(roleID):
            if rid is not None:
                roleID[i] = str(rid)
        if lr == 1:
            role = role[0]
            roleID = roleID[0]
    elif roleID is not None:
        roleID = str(roleID)
    if movieID is not None:
        movieID = str(movieID)
    if (not title) or (movieID is None):
        _b_m_logger.error('empty title or movieID for "%s"', txt)
    if rolesNoChar:
        rolesNoChar = filter(None, [x.strip() for x in rolesNoChar.split('/')])
        if not role:
            role = []
        elif not isinstance(role, list):
            role = [role]
        role += rolesNoChar
    notes = notes.strip()
    if additionalNotes:
        additionalNotes = re_spaces.sub(' ', additionalNotes).strip()
        if notes:
            notes += u' '
        notes += additionalNotes
    m = Movie(title=title, movieID=movieID, notes=notes, currentRole=role,
                roleID=roleID, roleIsPerson=_parsingCharacter,
                modFunct=modFunct, accessSystem=accessSystem)
    # roleID is still a list only if there are multiple roles (it may be
    # a string or None, here, and len() must not be called on None).
    if roleNotes and isinstance(roleID, list) and \
            len(roleNotes) == len(roleID):
        for idx, curRole in enumerate(m.currentRole):
            try:
                if roleNotes[idx]:
                    curRole.notes = roleNotes[idx]
            except IndexError:
                break
    # Status can't be checked here, and must be detected by the parser.
    if status:
        m['status'] = status
    return m
class DOMParserBase(object):
"""Base parser to handle HTML data from the IMDb's web server."""
_defGetRefs = False
_containsObjects = False
preprocessors = []
extractors = []
usingModule = None
_logger = logging.getLogger('imdbpy.parser.http.domparser')
def __init__(self, useModule=None):
"""Initialize the parser. useModule can be used to force it
to use 'BeautifulSoup' or 'lxml'; by default, it's auto-detected,
using 'lxml' if available and falling back to 'BeautifulSoup'
otherwise."""
# Module to use.
if useModule is None:
useModule = ('lxml', 'BeautifulSoup')
if not isinstance(useModule, (tuple, list)):
useModule = [useModule]
self._useModule = useModule
nrMods = len(useModule)
_gotError = False
for idx, mod in enumerate(useModule):
mod = mod.strip().lower()
try:
if mod == 'lxml':
from lxml.html import fromstring
from lxml.etree import tostring
self._is_xml_unicode = False
self.usingModule = 'lxml'
elif mod == 'beautifulsoup':
from bsouplxml.html import fromstring
from bsouplxml.etree import tostring
self._is_xml_unicode = True
self.usingModule = 'beautifulsoup'
else:
self._logger.warn('unknown module "%s"' % mod)
continue
self.fromstring = fromstring
self._tostring = tostring
if _gotError:
self._logger.warn('falling back to "%s"' % mod)
break
except ImportError, e:
if idx+1 >= nrMods:
# Raise the exception, if we don't have any more
# options to try.
raise IMDbError, 'unable to use any parser in %s: %s' % \
(str(useModule), str(e))
else:
self._logger.warn('unable to use "%s": %s' % (mod, str(e)))
_gotError = True
continue
else:
raise IMDbError, 'unable to use parsers in %s' % str(useModule)
# Fall-back defaults.
self._modFunct = None
self._as = 'http'
self._cname = self.__class__.__name__
self._init()
self.reset()
def reset(self):
"""Reset the parser."""
# Names and titles references.
self._namesRefs = {}
self._titlesRefs = {}
self._charactersRefs = {}
self._reset()
def _init(self):
"""Subclasses can override this method, if needed."""
pass
def _reset(self):
"""Subclasses can override this method, if needed."""
pass
def parse(self, html_string, getRefs=None, **kwds):
"""Return the dictionary generated from the given html string;
getRefs can be used to force the gathering of movies/persons/characters
references."""
self.reset()
if getRefs is not None:
self.getRefs = getRefs
else:
self.getRefs = self._defGetRefs
# Useful only for the testsuite.
if not isinstance(html_string, unicode):
html_string = unicode(html_string, 'latin_1', 'replace')
html_string = subXMLRefs(html_string)
# Temporary fix: self.parse_dom must work even for empty strings.
html_string = self.preprocess_string(html_string)
html_string = html_string.strip()
# tag attributes like title="&#x22;Family Guy&#x22;" will be
# converted to title=""Family Guy"" and this confuses BeautifulSoup.
if self.usingModule == 'beautifulsoup':
html_string = html_string.replace('""', '"')
#print html_string.encode('utf8')
if html_string:
dom = self.get_dom(html_string)
#print self.tostring(dom).encode('utf8')
try:
dom = self.preprocess_dom(dom)
except Exception, e:
self._logger.error('%s: caught exception preprocessing DOM',
self._cname, exc_info=True)
if self.getRefs:
try:
self.gather_refs(dom)
except Exception, e:
self._logger.warn('%s: unable to gather refs: %s',
self._cname, exc_info=True)
data = self.parse_dom(dom)
else:
data = {}
try:
data = self.postprocess_data(data)
except Exception, e:
self._logger.error('%s: caught exception postprocessing data',
self._cname, exc_info=True)
if self._containsObjects:
self.set_objects_params(data)
data = self.add_refs(data)
return data
def _build_empty_dom(self):
from bsouplxml import _bsoup
return _bsoup.BeautifulSoup('')
def get_dom(self, html_string):
"""Return a dom object, from the given string."""
try:
dom = self.fromstring(html_string)
if dom is None:
dom = self._build_empty_dom()
self._logger.error('%s: using a fake empty DOM', self._cname)
return dom
except Exception, e:
self._logger.error('%s: caught exception parsing DOM',
self._cname, exc_info=True)
return self._build_empty_dom()
def xpath(self, element, path):
"""Return elements matching the given XPath."""
try:
xpath_result = element.xpath(path)
if self._is_xml_unicode:
return xpath_result
result = []
for item in xpath_result:
if isinstance(item, str):
item = unicode(item)
result.append(item)
return result
except Exception, e:
self._logger.error('%s: caught exception extracting XPath "%s"',
self._cname, path, exc_info=True)
return []
def tostring(self, element):
"""Convert the element to a string."""
if isinstance(element, (unicode, str)):
return unicode(element)
else:
try:
return self._tostring(element, encoding=unicode)
except Exception, e:
self._logger.error('%s: unable to convert to string',
self._cname, exc_info=True)
return u''
def clone(self, element):
"""Clone an element."""
return self.fromstring(self.tostring(element))
def preprocess_string(self, html_string):
"""Here we can modify the text, before it's parsed."""
if not html_string:
return html_string
# Remove silly &nbsp;&raquo; chars.
html_string = html_string.replace(u' \xbb', u'')
try:
preprocessors = self.preprocessors
except AttributeError:
return html_string
for src, sub in preprocessors:
# re._pattern_type is present only since Python 2.5.
if callable(getattr(src, 'sub', None)):
html_string = src.sub(sub, html_string)
elif isinstance(src, str):
html_string = html_string.replace(src, sub)
elif callable(src):
try:
html_string = src(html_string)
except Exception, e:
_msg = '%s: caught exception preprocessing html'
self._logger.error(_msg, self._cname, exc_info=True)
continue
##print html_string.encode('utf8')
return html_string
def gather_refs(self, dom):
"""Collect references."""
grParser = GatherRefs(useModule=self._useModule)
grParser._as = self._as
grParser._modFunct = self._modFunct
refs = grParser.parse_dom(dom)
refs = grParser.postprocess_data(refs)
self._namesRefs = refs['names refs']
self._titlesRefs = refs['titles refs']
self._charactersRefs = refs['characters refs']
def preprocess_dom(self, dom):
"""Last chance to modify the dom, before the rules in self.extractors
are applied by the parse_dom method."""
return dom
def parse_dom(self, dom):
"""Parse the given dom according to the rules specified
in self.extractors."""
result = {}
for extractor in self.extractors:
##print extractor.label
if extractor.group is None:
elements = [(extractor.label, element)
for element in self.xpath(dom, extractor.path)]
else:
groups = self.xpath(dom, extractor.group)
elements = []
for group in groups:
group_key = self.xpath(group, extractor.group_key)
if not group_key: continue
group_key = group_key[0]
# XXX: always tries the conversion to unicode:
# BeautifulSoup.NavigableString is a subclass
# of unicode, and so it's never converted.
group_key = self.tostring(group_key)
normalizer = extractor.group_key_normalize
if normalizer is not None:
if callable(normalizer):
try:
group_key = normalizer(group_key)
except Exception, e:
_m = '%s: unable to apply group_key normalizer'
self._logger.error(_m, self._cname,
exc_info=True)
group_elements = self.xpath(group, extractor.path)
elements.extend([(group_key, element)
for element in group_elements])
for group_key, element in elements:
for attr in extractor.attrs:
if isinstance(attr.path, dict):
data = {}
for field in attr.path.keys():
path = attr.path[field]
value = self.xpath(element, path)
if not value:
data[field] = None
else:
# XXX: use u'' , to join?
data[field] = ''.join(value)
else:
data = self.xpath(element, attr.path)
if not data:
data = None
else:
data = attr.joiner.join(data)
if not data:
continue
attr_postprocess = attr.postprocess
if callable(attr_postprocess):
try:
data = attr_postprocess(data)
except Exception, e:
_m = '%s: unable to apply attr postprocess'
self._logger.error(_m, self._cname, exc_info=True)
key = attr.key
if key is None:
key = group_key
elif key.startswith('.'):
# assuming this is an xpath
try:
key = self.xpath(element, key)[0]
except IndexError:
self._logger.error('%s: XPath returned no items',
self._cname, exc_info=True)
elif key.startswith('self.'):
key = getattr(self, key[5:])
if attr.multi:
if key not in result:
result[key] = []
result[key].append(data)
else:
if isinstance(data, dict):
result.update(data)
else:
result[key] = data
return result
def postprocess_data(self, data):
"""Here we can modify the data."""
return data
def set_objects_params(self, data):
"""Set parameters of Movie/Person/... instances, since they are
not always set in the parser's code."""
for obj in flatten(data, yieldDictKeys=True, scalar=_Container):
obj.accessSystem = self._as
obj.modFunct = self._modFunct
def add_refs(self, data):
"""Modify data according to the expected output."""
if self.getRefs:
titl_re = ur'(%s)' % '|'.join([re.escape(x) for x
in self._titlesRefs.keys()])
if titl_re != ur'()': re_titles = re.compile(titl_re, re.U)
else: re_titles = None
nam_re = ur'(%s)' % '|'.join([re.escape(x) for x
in self._namesRefs.keys()])
if nam_re != ur'()': re_names = re.compile(nam_re, re.U)
else: re_names = None
chr_re = ur'(%s)' % '|'.join([re.escape(x) for x
in self._charactersRefs.keys()])
if chr_re != ur'()': re_characters = re.compile(chr_re, re.U)
else: re_characters = None
_putRefs(data, re_titles, re_names, re_characters)
return {'data': data, 'titlesRefs': self._titlesRefs,
'namesRefs': self._namesRefs,
'charactersRefs': self._charactersRefs}
class Extractor(object):
    """Instruct the DOM parser about how to parse a document."""

    def __init__(self, label, path, attrs, group=None, group_key=None,
                group_key_normalize=None):
        """Initialize an Extractor object, used to instruct the DOM parser
        about how to parse a document.

        path is the XPath of the elements to consider and attrs the
        Attribute (or the list of Attribute objects) to fetch from them;
        group, group_key (default: ".//text()") and group_key_normalize
        are used for grouped elements."""
        if group_key is None:
            group_key = ".//text()"
        # A list of attributes to fetch.
        if isinstance(attrs, Attribute):
            attrs = [attrs]
        # rarely (never?) used, mostly for debugging purposes.
        self.label = label
        self.path = path
        self.attrs = attrs
        self.group = group
        self.group_key = group_key
        self.group_key_normalize = group_key_normalize

    def __repr__(self):
        """String representation of an Extractor object."""
        template = '<Extractor id:%s (label=%s, path=%s, attrs=%s, ' \
                    'group=%s, group_key=%s group_key_normalize=%s)>'
        values = (id(self), self.label, self.path, repr(self.attrs),
                    self.group, self.group_key, self.group_key_normalize)
        return template % values
class Attribute(object):
    """The attribute to consider, for a given node."""

    def __init__(self, key, multi=False, path=None, joiner=None,
                postprocess=None):
        """Initialize an Attribute object, used to specify the
        attribute to consider, for a given node.

        key is the key under which information will be saved; it can be
        a string or an XPath and, if None, the label of the containing
        extractor will be used.  path is an XPath or a dictionary of
        XPaths; joiner (default: the empty string) is used to join the
        matched values; postprocess, if given, is called on the
        collected information."""
        if joiner is None:
            joiner = ''
        self.key = key
        self.multi = multi
        self.path = path
        self.joiner = joiner
        # Post-process this set of information.
        self.postprocess = postprocess

    def __repr__(self):
        """String representation of an Attribute object."""
        template = '<Attribute id:%s (key=%s, multi=%s, path=%s, ' \
                    'joiner=%s, postprocess=%s)>'
        values = (id(self), self.key, self.multi, repr(self.path),
                    self.joiner, repr(self.postprocess))
        return template % values
def _parse_ref(text, link, info):
    """Manage links to references: return a (text, link) tuple, where
    the newlines in text are replaced by spaces.

    For links to movies, if info begins with a year (and, optionally,
    a kind) as recognized by re_yearKind_index, it's appended to text."""
    is_title = link.find('/title/tt') != -1
    if is_title:
        # match() is anchored, so a match always starts at position 0.
        yearK = re_yearKind_index.match(info)
        if yearK:
            text = text + ' %s' % info[:yearK.end()]
    cleaned = text.replace('\n', ' ')
    return (cleaned, link)
class GatherRefs(DOMParserBase):
    """Parser used to gather references to movies, persons and characters."""
    # For every link: its text, its href and the first text node after
    # it, combined by _parse_ref in a (text, link) tuple.
    _attrs = [Attribute(key=None, multi=True,
                        path={
                            'text': './text()',
                            'link': './@href',
                            'info': './following::text()[1]'
                            },
        postprocess=lambda x: _parse_ref(x.get('text'), x.get('link'),
                                         (x.get('info') or u'').strip()))]
    # Only the links whose href has the exact expected length are selected.
    extractors = [
        Extractor(label='names refs',
            path="//a[starts-with(@href, '/name/nm')][string-length(@href)=16]",
            attrs=_attrs),

        Extractor(label='titles refs',
            path=("//a[starts-with(@href, '/title/tt')]"
                    "[string-length(@href)=17]"),
            attrs=_attrs),

        Extractor(label='characters refs',
            path=("//a[starts-with(@href, '/character/ch')]"
                    "[string-length(@href)=21]"),
            attrs=_attrs),
            ]

    def postprocess_data(self, data):
        """Return a dictionary with the 'names refs', 'titles refs' and
        'characters refs' keys; each value maps the text of a link to
        a Person, Movie or Character instance.  Links not ending with
        a slash are ignored."""
        # Section, class to instance, keyword for the ID and for the text.
        sections = (
            ('names refs', Person, 'personID', 'name'),
            ('titles refs', Movie, 'movieID', 'title'),
            ('characters refs', Character, 'characterID', 'name'),
            # XXX: companies aren't handled: are they ever found in text,
            #      as links to their page?
        )
        result = {}
        for section, klass, idKey, textKey in sections:
            refs = {}
            result[section] = refs
            for text, link in data.get(section, []):
                if not link.endswith('/'):
                    continue
                kwds = {idKey: analyze_imdbid(link), textKey: text,
                        'accessSystem': self._as,
                        'modFunct': self._modFunct}
                refs[text] = klass(**kwds)
        return result

    def add_refs(self, data):
        """Return data unchanged: there is nothing to add, here."""
        return data
-833
View File
@@ -1,833 +0,0 @@
"""
parser.mobile package (imdb package).
This package provides the IMDbMobileAccessSystem class used to access
IMDb's data for mobile systems.
the imdb.IMDb function will return an instance of this class when
called with the 'accessSystem' argument set to "mobile".
Copyright 2005-2010 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import re
import logging
from urllib import unquote
from imdb import imdbURL_movie_main, imdbURL_person_main, imdbURL_character_main
from imdb.Movie import Movie
from imdb.utils import analyze_title, analyze_name, canonicalName, \
date_and_notes
from imdb._exceptions import IMDbDataAccessError
from imdb.parser.http import IMDbHTTPAccessSystem
from imdb.parser.http.utils import subXMLRefs, subSGMLRefs, build_person, \
build_movie, re_spaces
# XXX NOTE: the first version of this module was heavily based on
# regular expressions.  This new version replaces regexps with
# calls to the find() method of strings; despite being less flexible,
# it seems to be at least as fast and, hopefully, much more
# lightweight.  Yes: the regexp-based version was too heavyweight
# for systems with very limited CPU power and memory footprint.
re_spacessub = re_spaces.sub
# Strip html.
re_unhtml = re.compile(r'<.+?>')
re_unhtmlsub = re_unhtml.sub
# imdb person or movie ids.
# Seven digits preceded by 'nm', 'tt' or 'ch' (so, character IDs are
# matched too) and followed by a word boundary; only digits are captured.
re_imdbID = re.compile(r'(?<=nm|tt|ch)([0-9]{7})\b')
# movie AKAs.
re_makas = re.compile('(<p class="find-aka">.*?</p>)')
# Remove episode numbers.
re_filmo_episodes = re.compile('<div class="filmo-episodes">.*?</div>',
re.M | re.I)
def _unHtml(s):
    """Return a string without tags and no multiple spaces.

    Tags are removed, runs of whitespace are reduced to a single space,
    the result is stripped and then its SGML references are replaced."""
    without_tags = re_unhtmlsub('', s)
    single_spaced = re_spacessub(' ', without_tags).strip()
    return subSGMLRefs(single_spaced)
# The type of integers.
# NOTE(review): not referenced in the visible part of this module --
# check if it's still used elsewhere.
_inttype = type(0)
def _getTagsWith(s, cont, toClosure=False, maxRes=None):
"""Return the html tags in the 's' string containing the 'cont'
string; if toClosure is True, everything between the opening
tag and the closing tag is returned."""
lres = []
bi = s.find(cont)
if bi != -1:
btag = s[:bi].rfind('<')
if btag != -1:
if not toClosure:
etag = s[bi+1:].find('>')
if etag != -1:
endidx = bi+2+etag
lres.append(s[btag:endidx])
if maxRes is not None and len(lres) >= maxRes: return lres
lres += _getTagsWith(s[endidx:], cont,
toClosure=toClosure)
else:
spaceidx = s[btag:].find(' ')
if spaceidx != -1:
ctag = '</%s>' % s[btag+1:btag+spaceidx]
closeidx = s[bi:].find(ctag)
if closeidx != -1:
endidx = bi+closeidx+len(ctag)
lres.append(s[btag:endidx])
if maxRes is not None and len(lres) >= maxRes:
return lres
lres += _getTagsWith(s[endidx:], cont,
toClosure=toClosure)
return lres
def _findBetween(s, begins, ends, beginindx=0, maxRes=None, lres=None):
"""Return the list of strings from the 's' string which are included
between the 'begins' and 'ends' strings."""
if lres is None:
lres = []
bi = s.find(begins, beginindx)
if bi != -1:
lbegins = len(begins)
if isinstance(ends, (list, tuple)):
eset = [s.find(end, bi+lbegins) for end in ends]
eset[:] = [x for x in eset if x != -1]
if not eset: ei = -1
else: ei = min(eset)
else:
ei = s.find(ends, bi+lbegins)
if ei != -1:
match = s[bi+lbegins:ei]
lres.append(match)
if maxRes is not None and len(lres) >= maxRes: return lres
_findBetween(s, begins, ends, beginindx=ei, maxRes=maxRes,
lres=lres)
return lres
class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
"""The class used to access IMDb's data through the web for
mobile terminals."""
accessSystem = 'mobile'
_mobile_logger = logging.getLogger('imdbpy.parser.mobile')
def __init__(self, isThin=1, *arguments, **keywords):
    """Initialize the access system: set accessSystem to 'mobile' and
    pass isThin and every other argument to the __init__ method
    of IMDbHTTPAccessSystem."""
    self.accessSystem = 'mobile'
    IMDbHTTPAccessSystem.__init__(self, isThin, *arguments, **keywords)
def _clean_html(self, html):
    """Normalize the retrieved html: shrink every run of whitespace to
    a single space, remove the '&nbsp;&raquo;' sequences and replace
    entities and char references."""
    shrunk = re_spaces.sub(' ', html)
    # Remove silly &nbsp;&raquo; chars.
    cleaned = shrunk.replace('&nbsp;&raquo;', '')
    return subXMLRefs(cleaned)
def _mretrieve(self, url, size=-1):
    """Retrieve an html page (with self._retrieve, to which size is
    passed) and normalize it with self._clean_html."""
    return self._clean_html(self._retrieve(url, size=size))
def _getPersons(self, s, sep='<br/>'):
"""Return a list of Person objects, from the string s; items
are assumed to be separated by the sep string.

Items without an imdbID (no match for re_imdbID) and items whose
cleaned name is empty are skipped; the billingPos of the returned
objects counts only the items that were kept, starting from 1."""
names = s.split(sep)
pl = []
plappend = pl.append
counter = 1
for name in names:
pid = re_imdbID.findall(name)
if not pid: continue
# Look for the tag holding the character(s) played by this person.
characters = _getTagsWith(name, 'class="char"',
toClosure=True, maxRes=1)
chpids = []
if characters:
# Multiple characters are separated by ' / '; a character without
# an ID is recorded as None.
for ch in characters[0].split(' / '):
chid = re_imdbID.findall(ch)
if not chid:
chpids.append(None)
else:
chpids.append(chid[-1])
# roleID is None for no characters, a single value for one character
# and a list otherwise.
if not chpids:
chpids = None
elif len(chpids) == 1:
chpids = chpids[0]
name = _unHtml(name)
# Catch unclosed tags.
gt_indx = name.find('>')
if gt_indx != -1:
name = name[gt_indx+1:].lstrip()
if not name: continue
# Remove a trailing '...' from the name.
if name.endswith('...'):
name = name[:-3]
p = build_person(name, personID=str(pid[0]), billingPos=counter,
modFunct=self._defModFunct, roleID=chpids,
accessSystem=self.accessSystem)
plappend(p)
counter += 1
return pl
def _search_movie(self, title, results):
    """Search for movies matching 'title'.

    Return a list of (movieID, data) tuples, where data is the
    dictionary built by analyze_title; for entries of a result list it
    may also contain an 'akas' list.  An empty list is returned when
    the page cannot be parsed.

    title   -- the title to search for.
    results -- number of requested results, as passed to
               self._get_search_content.
    """
    cont = subXMLRefs(self._get_search_content('tt', title, results))
    # Keep the searched title in 'title': it's needed by the log messages.
    page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not page_title:
        self._mobile_logger.error('no title tag searching for movie %s',
                                  title)
        return res
    tl = page_title[0].lower()
    if not tl.startswith('imdb title'):
        # a direct hit!
        hit_title = _unHtml(page_title[0])
        mid = None
        midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if midtag:
            mid = _findBetween(midtag[0], '/title/tt', '/', maxRes=1)
        if not (mid and hit_title):
            self._mobile_logger.error('no direct hit title/movieID for' \
                                      ' title %s', title)
            return res
        if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
            hit_title += ' (mini)'
        res[:] = [(str(mid[0]), analyze_title(hit_title))]
    else:
        # XXX: this results*3 prevents some recursion errors, but...
        #      it's not exactly understandable (i.e.: why 'results' is
        #      not enough to get all the results?)
        lis = _findBetween(cont, 'td valign="top">', '</td>',
                           maxRes=results*3)
        for li in lis:
            akas = []
            for aka in re_makas.findall(li):
                aka = aka.replace('" - ', '::', 1)
                aka = _unHtml(aka)
                if aka.startswith('aka "'):
                    aka = aka[5:].strip()
                # endswith is safe even when nothing is left of the aka.
                if aka.endswith('"'):
                    aka = aka[:-1]
                if aka:
                    akas.append(aka)
            imdbid = re_imdbID.findall(li)
            li = re_makas.sub('', li)
            mtitle = _unHtml(li)
            if not (imdbid and mtitle):
                self._mobile_logger.debug('no title/movieID parsing' \
                                          ' %s searching for title %s', li,
                                          title)
                continue
            mtitle = mtitle.replace('(TV mini-series)', '(mini)')
            resd = analyze_title(mtitle)
            if akas:
                resd['akas'] = akas
            res.append((str(imdbid[0]), resd))
    return res
def get_movie_main(self, movieID):
    """Retrieve and parse the 'maindetails' page of a movie.

    Return a dictionary in the form {'data': d}, where d contains the
    information built by analyze_title from the page title plus every
    other piece of data found in the page (director, writer, cast,
    genres, rating, votes, akas, runtimes, countries and so on).

    Raise IMDbDataAccessError if the page has no <title> tag.
    """
    cont = self._mretrieve(imdbURL_movie_main % movieID + 'maindetails')
    title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    if not title:
        raise IMDbDataAccessError('unable to get movieID "%s"' % movieID)
    title = _unHtml(title[0])
    if cont.find('<span class="tv-extra">TV mini-series</span>') != -1:
        title += ' (mini)'
    d = analyze_title(title)
    kind = d.get('kind')
    # If the page links to a TV series, this title is one of its episodes.
    tv_series = _findBetween(cont, 'TV Series:</h5>', '</a>', maxRes=1)
    if tv_series:
        mid = re_imdbID.findall(tv_series[0])
    else:
        mid = None
    if tv_series and mid:
        s_title = _unHtml(tv_series[0])
        s_data = analyze_title(s_title)
        m = Movie(movieID=str(mid[0]), data=s_data,
                  accessSystem=self.accessSystem,
                  modFunct=self._defModFunct)
        d['kind'] = kind = u'episode'
        d['episode of'] = m
    if kind in ('tv series', 'tv mini series'):
        years = _findBetween(cont, '<h1>', '</h1>', maxRes=1)
        if years:
            years[:] = _findBetween(years[0], 'TV series', '</span>',
                                    maxRes=1)
            if years:
                d['series years'] = years[0].strip()
    air_date = _findBetween(cont, 'Original Air Date:</h5>', '</div>',
                            maxRes=1)
    if air_date:
        air_date = air_date[0]
        vi = air_date.find('(')
        if vi != -1:
            date = _unHtml(air_date[:vi]).strip()
            if date != '????':
                d['original air date'] = date
            air_date = air_date[vi:]
            season = _findBetween(air_date, 'Season', ',', maxRes=1)
            if season:
                season = season[0].strip()
                try:
                    season = int(season)
                except ValueError:
                    pass
                # Accept the number zero, but not an empty string.
                if season or type(season) is _inttype:
                    d['season'] = season
            episode = _findBetween(air_date, 'Episode', ')', maxRes=1)
            if episode:
                episode = episode[0].strip()
                try:
                    episode = int(episode)
                except ValueError:
                    pass
                # Accept the number zero, but not an empty string; the
                # check must be done on the episode itself, not on the
                # season.
                if episode or type(episode) is _inttype:
                    d['episode'] = episode
    direct = _findBetween(cont, '<h5>Director', ('</div>', '<br/> <br/>'),
                          maxRes=1)
    if direct:
        direct = direct[0]
        h5idx = direct.find('/h5>')
        if h5idx != -1:
            direct = direct[h5idx+4:]
        direct = self._getPersons(direct)
        if direct:
            d['director'] = direct
    if kind in ('tv series', 'tv mini series', 'episode'):
        if kind != 'episode':
            seasons = _findBetween(cont, 'Seasons:</h5>', '</div>',
                                   maxRes=1)
            if seasons:
                d['number of seasons'] = seasons[0].count('|') + 1
        creator = _findBetween(cont, 'Created by</h5>',
                               ('class="tn15more"', '</div>',
                                '<br/> <br/>'), maxRes=1)
        if not creator:
            # They change 'Created by' to 'Creator' and viceversa
            # from time to time...
            # XXX: is 'Creators' also used?
            creator = _findBetween(cont, 'Creator:</h5>',
                                   ('class="tn15more"', '</div>',
                                    '<br/> <br/>'), maxRes=1)
        if creator:
            creator = creator[0]
            # NOTE(review): str.find returns -1 (a true value) when the
            # text is not found, so the '>' is appended unless the string
            # begins with 'tn15more'; left untouched on purpose, since
            # the text is cut before 'class="tn15more"' and the appended
            # '>' presumably closes the truncated tag -- confirm.
            if creator.find('tn15more'):
                creator = '%s>' % creator
            creator = self._getPersons(creator)
            if creator:
                d['creator'] = creator
    writers = _findBetween(cont, '<h5>Writer', ('</div>', '<br/> <br/>'),
                           maxRes=1)
    if writers:
        writers = writers[0]
        h5idx = writers.find('/h5>')
        if h5idx != -1:
            writers = writers[h5idx+4:]
        writers = self._getPersons(writers)
        if writers:
            d['writer'] = writers
    cvurl = _getTagsWith(cont, 'name="poster"', toClosure=True, maxRes=1)
    if cvurl:
        cvurl = _findBetween(cvurl[0], 'src="', '"', maxRes=1)
        if cvurl:
            d['cover url'] = cvurl[0]
    genres = _findBetween(cont, 'href="/Sections/Genres/', '/')
    if genres:
        d['genres'] = list(set(genres))
    ur = _findBetween(cont, '<div class="starbar-meta">', '</div>',
                      maxRes=1)
    if ur:
        rat = _findBetween(ur[0], '<b>', '</b>', maxRes=1)
        if rat:
            teni = rat[0].find('/10')
            if teni != -1:
                rat = rat[0][:teni]
                try:
                    rat = float(rat.strip())
                    d['rating'] = rat
                except ValueError:
                    self._mobile_logger.warn('wrong rating: %s', rat)
        vi = ur[0].rfind('tn15more">')
        # Skip the votes when the text after the tag contains 'await'.
        if vi != -1 and ur[0][vi+10:].find('await') == -1:
            try:
                votes = _unHtml(ur[0][vi+10:]).replace('votes', '').strip()
                votes = int(votes.replace(',', ''))
                d['votes'] = votes
            except ValueError:
                self._mobile_logger.warn('wrong votes: %s', ur)
    top250 = _findBetween(cont, 'href="/chart/top?', '</a>', maxRes=1)
    if top250:
        fn = top250[0].rfind('#')
        if fn != -1:
            try:
                td = int(top250[0][fn+1:])
                d['top 250 rank'] = td
            except ValueError:
                self._mobile_logger.warn('wrong top250: %s', top250)
    # The cast table can be introduced by different headers.
    castdata = _findBetween(cont, 'Cast overview', '</table>', maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Credited cast', '</table>', maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Complete credited cast', '</table>',
                                maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Series Cast Summary', '</table>',
                                maxRes=1)
    if not castdata:
        castdata = _findBetween(cont, 'Episode Credited cast', '</table>',
                                maxRes=1)
    if castdata:
        castdata = castdata[0]
        # Reintegrate the first tag.
        fl = castdata.find('href=')
        if fl != -1:
            castdata = '<a ' + castdata[fl:]
        # Exclude the 'rest of cast listed alphabetically' row.
        smib = castdata.find('<tr><td align="center" colspan="4"><small>')
        if smib != -1:
            smie = castdata.rfind('</small></td></tr>')
            if smie != -1:
                castdata = castdata[:smib].strip() + \
                           castdata[smie+18:].strip()
        castdata = castdata.replace('/tr> <tr', '/tr><tr')
        cast = self._getPersons(castdata, sep='</tr><tr')
        if cast:
            d['cast'] = cast
    akas = _findBetween(cont, 'Also Known As:</h5>', '</div>', maxRes=1)
    if akas:
        # For some reason, here <br> is still used in place of <br/>.
        akas[:] = [x for x in akas[0].split('<br>') if x.strip()]
        akas = [_unHtml(x).replace('" - ','::', 1).lstrip('"').strip()
                for x in akas]
        if 'See more' in akas:
            akas.remove('See more')
        akas[:] = [x for x in akas if x]
        if akas:
            d['akas'] = akas
    mpaa = _findBetween(cont, 'MPAA</a>:', '</div>', maxRes=1)
    if mpaa:
        d['mpaa'] = _unHtml(mpaa[0])
    runtimes = _findBetween(cont, 'Runtime:</h5>', '</div>', maxRes=1)
    if runtimes:
        runtimes = runtimes[0]
        runtimes = [x.strip().replace(' min', '').replace(' (', '::(', 1)
                    for x in runtimes.split('|')]
        d['runtimes'] = [_unHtml(x).strip() for x in runtimes]
    if kind == 'episode':
        # number of episodes.
        epsn = _findBetween(cont, 'title="Full Episode List">', '</a>',
                            maxRes=1)
        if epsn:
            epsn = epsn[0].replace(' Episodes', '').strip()
            if epsn:
                try:
                    epsn = int(epsn)
                except ValueError:
                    self._mobile_logger.warn('wrong episodes #: %s', epsn)
                # The value is stored even if it's not a number.
                d['number of episodes'] = epsn
    country = _findBetween(cont, 'Country:</h5>', '</div>', maxRes=1)
    if country:
        country[:] = country[0].split(' | ')
        country[:] = ['<a %s' % x for x in country if x]
        country[:] = [_unHtml(x.replace(' <i>', '::')) for x in country]
        if country:
            d['countries'] = country
    lang = _findBetween(cont, 'Language:</h5>', '</div>', maxRes=1)
    if lang:
        lang[:] = lang[0].split(' | ')
        lang[:] = ['<a %s' % x for x in lang if x]
        lang[:] = [_unHtml(x.replace(' <i>', '::')) for x in lang]
        if lang:
            d['languages'] = lang
    col = _findBetween(cont, '"/search/title?colors=', '</div>')
    if col:
        col[:] = col[0].split(' | ')
        col[:] = ['<a %s' % x for x in col if x]
        col[:] = [_unHtml(x.replace(' <i>', '::')) for x in col]
        if col:
            d['color info'] = col
    sm = _findBetween(cont, '/search/title?sound_mixes=', '</div>',
                      maxRes=1)
    if sm:
        sm[:] = sm[0].split(' | ')
        sm[:] = ['<a %s' % x for x in sm if x]
        sm[:] = [_unHtml(x.replace(' <i>', '::')) for x in sm]
        if sm:
            d['sound mix'] = sm
    cert = _findBetween(cont, 'Certification:</h5>', '</div>', maxRes=1)
    if cert:
        cert[:] = cert[0].split(' | ')
        cert[:] = [_unHtml(x.replace(' <i>', '::')) for x in cert]
        if cert:
            d['certificates'] = cert
    plotoutline = _findBetween(cont, 'Plot:</h5>', ['<a ', '</div>'],
                               maxRes=1)
    if plotoutline:
        plotoutline = plotoutline[0].strip()
        plotoutline = plotoutline.rstrip('|').rstrip()
        if plotoutline:
            d['plot outline'] = _unHtml(plotoutline)
    aratio = _findBetween(cont, 'Aspect Ratio:</h5>', ['<a ', '</div>'],
                          maxRes=1)
    if aratio:
        aratio = aratio[0].strip().replace(' (', '::(', 1)
        if aratio:
            d['aspect ratio'] = _unHtml(aratio)
    return {'data': d}
def get_movie_plot(self, movieID):
    """Retrieve and parse the 'plotsummary' page of a movie.

    Return {'data': {'plot': [...]}}, or {'data': {}} when no plot is
    found.  When a plot contains ' Written by ', it's stored in the
    'text::author' form, with '{' and '}' in the author replaced by
    '<' and '>'.
    """
    page = self._mretrieve(imdbURL_movie_main % movieID + 'plotsummary')
    summaries = []
    for raw in _findBetween(page, '<p class="plotpar">', '</p>'):
        text = _unHtml(raw)
        split_at = text.rfind(' Written by ')
        if split_at != -1:
            # 12 is the length of the ' Written by ' separator.
            author = text[split_at+12:].rstrip()
            author = author.replace('{','<').replace('}','>')
            text = '%s::%s' % (text[:split_at].rstrip(), author)
        summaries.append(text)
    if not summaries:
        return {'data': {}}
    return {'data': {'plot': summaries}}
def _search_person(self, name, results):
    """Search for persons matching 'name'.

    Return a list of (personID, data) tuples, where data is the
    dictionary built by analyze_name (with canonical=1); for entries of
    a result list it may also contain an 'akas' list.  An empty list is
    returned when the page cannot be parsed.

    name    -- the name to search for.
    results -- number of requested results, as passed to
               self._get_search_content.
    """
    cont = subXMLRefs(self._get_search_content('nm', name, results))
    # Keep the searched name in 'name': it's needed by the log messages.
    page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not page_title:
        self._mobile_logger.warn('no title tag searching for name %s', name)
        return res
    nl = page_title[0].lower()
    if not nl.startswith('imdb name'):
        # a direct hit!
        hit_name = _unHtml(page_title[0])
        hit_name = hit_name.replace('- Filmography by type' , '').strip()
        pid = None
        pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if pidtag:
            pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1)
        if not (pid and hit_name):
            self._mobile_logger.error('no direct hit name/personID for' \
                                      ' name %s', name)
            return res
        res[:] = [(str(pid[0]), analyze_name(hit_name, canonical=1))]
    else:
        lis = _findBetween(cont, 'td valign="top">', '</td>',
                           maxRes=results*3)
        for li in lis:
            akas = _findBetween(li, '<em>"', '"</em>')
            # Cut the item at each of these separators that is found.
            for sep in ['<small', '<br> aka', '<br> birth name']:
                sepIdx = li.find(sep)
                if sepIdx != -1:
                    li = li[:sepIdx]
            pid = re_imdbID.findall(li)
            pname = _unHtml(li)
            if not (pid and pname):
                self._mobile_logger.debug('no name/personID parsing' \
                                          ' %s searching for name %s', li,
                                          name)
                continue
            resd = analyze_name(pname, canonical=1)
            if akas:
                resd['akas'] = akas
            res.append((str(pid[0]), resd))
    return res
def get_person_main(self, personID, _parseChr=False):
"""Retrieve and parse the main page of a person or, if _parseChr is
true, of a character (in which case personID is a characterID).

Return {'data': r, 'info sets': ('main', 'filmography')}, where r
contains the data built by analyze_name plus, when found, birth/death
dates and notes, akas, headshot and one list of movies for every
filmography section.

Raise IMDbDataAccessError if the page has no <title> tag."""
if not _parseChr:
url = imdbURL_person_main % personID + 'maindetails'
else:
url = imdbURL_character_main % personID
s = self._mretrieve(url)
r = {}
name = _findBetween(s, '<title>', '</title>', maxRes=1)
if not name:
if _parseChr: w = 'characterID'
else: w = 'personID'
raise IMDbDataAccessError, 'unable to get %s "%s"' % (w, personID)
name = _unHtml(name[0].replace(' - IMDb', ''))
# Remove the trailing text that is not part of the name.
if _parseChr:
name = name.replace('(Character)', '').strip()
name = name.replace('- Filmography by type', '').strip()
else:
name = name.replace('- Filmography by', '').strip()
r = analyze_name(name, canonical=not _parseChr)
# Birth and death information: the text before ' in ' is stored as the
# date, what follows as the notes.
for dKind in ('Born', 'Died'):
date = _findBetween(s, '%s:</h4>' % dKind.capitalize(),
('<div class', '</div>', '<br/><br/>'), maxRes=1)
if date:
date = _unHtml(date[0])
if date:
#date, notes = date_and_notes(date)
# TODO: fix to handle real names.
date_notes = date.split(' in ', 1)
notes = u''
date = date_notes[0]
if len(date_notes) == 2:
notes = date_notes[1]
dtitle = 'birth'
if dKind == 'Died':
dtitle = 'death'
if date:
r['%s date' % dtitle] = date
if notes:
r['%s notes' % dtitle] = notes
# Alternate names are separated by ' | ' or, failing that, by ' / '.
akas = _findBetween(s, 'Alternate Names:</h5>', ('</div>',
'<br/><br/>'), maxRes=1)
if akas:
akas = akas[0]
if akas.find(' | ') != -1:
akas = _unHtml(akas).split(' | ')
else:
akas = _unHtml(akas).split(' / ')
if akas: r['akas'] = akas
hs = _findBetween(s, 'name="headshot"', '</a>', maxRes=1)
if hs:
hs[:] = _findBetween(hs[0], 'src="', '"', maxRes=1)
if hs: r['headshot'] = hs[0]
# Build a list of tuples such [('hrefLink', 'section name')]
workkind = _findBetween(s, 'id="jumpto_', '</a>')
ws = []
for work in workkind:
sep = '" >'
if '">' in work:
sep = '">'
wsplit = work.split(sep, 1)
if len(wsplit) == 2:
sect = wsplit[0]
if '"' in sect:
sect = sect[:sect.find('"')]
ws.append((sect, wsplit[1].lower()))
# XXX: I think "guest appearances" are gone.
if s.find('<a href="#guest-appearances"') != -1:
ws.append(('guest-appearances', 'notable tv guest appearances'))
#if _parseChr:
# ws.append(('filmography', 'filmography'))
for sect, sectName in ws:
raws = u''
# Everything between the current section link and the end
# of the <ol> tag.
if _parseChr and sect == 'filmography':
inisect = s.find('<div class="filmo">')
else:
inisect = s.find('<a name="%s' % sect)
if inisect != -1:
endsect = s[inisect:].find('<div id="filmo-head-')
if endsect != -1: raws = s[inisect:inisect+endsect]
if not raws: continue
mlist = _findBetween(raws, '<div class="filmo-row',
('<div class="clear"/>',))
for m in mlist:
# Drop what is left of the opening tag of the row.
fCB = m.find('>')
if fCB != -1:
m = m[fCB+1:].lstrip()
m = re_filmo_episodes.sub('', m)
# For every movie in the current section.
movieID = re_imdbID.findall(m)
if not movieID:
self._mobile_logger.debug('no movieID in %s', m)
continue
# The first <br/> is turned into the ' .... ' separator that is
# searched for below.
m = m.replace('<br/>', ' .... ', 1)
if not _parseChr:
chrIndx = m.find(' .... ')
else:
chrIndx = m.find(' Played by ')
chids = []
if chrIndx != -1:
# 6 is the length of ' .... '; for characters, 5 more chars are
# skipped, for a total of 11: the length of ' Played by '.
chrtxt = m[chrIndx+6:]
if _parseChr:
chrtxt = chrtxt[5:]
for ch in chrtxt.split(' / '):
chid = re_imdbID.findall(ch)
if not chid:
chids.append(None)
else:
chids.append(chid[-1])
if not chids:
chids = None
elif len(chids) == 1:
chids = chids[0]
movieID = str(movieID[0])
# Search the status.
stidx = m.find('<i>')
status = u''
if stidx != -1:
stendidx = m.rfind('</i>')
if stendidx != -1:
status = _unHtml(m[stidx+3:stendidx])
m = m.replace(m[stidx+3:stendidx], '')
# The year is taken from its own <span> tag, that is then removed
# from the text of the title.
year = _findBetween(m, 'year_column">', '</span>', maxRes=1)
if year:
year = year[0]
m = m.replace('<span class="year_column">%s</span>' % year,
'')
else:
year = None
m = _unHtml(m)
if not m:
self._mobile_logger.warn('no title for movieID %s', movieID)
continue
movie = build_movie(m, movieID=movieID, status=status,
roleID=chids, modFunct=self._defModFunct,
accessSystem=self.accessSystem,
_parsingCharacter=_parseChr, year=year)
# Only the text before the first ':' is used as the key.
sectName = sectName.split(':')[0]
r.setdefault(sectName, []).append(movie)
# If available, take the always correct name from a form.
itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
if not itag:
itag = _getTagsWith(s, 'name="primary"', maxRes=1)
if itag:
vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
if not vtag:
vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
if vtag:
try:
vtag = unquote(str(vtag[0]))
vtag = unicode(vtag, 'latin_1')
r.update(analyze_name(vtag))
except UnicodeEncodeError:
# The name parsed from the title of the page is kept.
pass
return {'data': r, 'info sets': ('main', 'filmography')}
def get_person_biography(self, personID):
    """Retrieve and parse the 'bio' page of a person.

    Return {'data': d}, where d may contain 'spouse', 'nick names',
    'birth date', 'birth notes', 'death date', 'death notes',
    'mini biography' (a list of 'text::author' strings) and one key
    for every other <h5> section found in the page.
    """
    cont = self._mretrieve(imdbURL_person_main % personID + 'bio')
    d = {}
    spouses = _findBetween(cont, 'Spouse</h5>', ('</table>', '</dd>'),
                           maxRes=1)
    if spouses:
        sl = []
        for spouse in spouses[0].split('</tr>'):
            if spouse.count('</td>') > 1:
                spouse = spouse.replace('</td>', '::</td>', 1)
            spouse = _unHtml(spouse)
            spouse = spouse.replace(':: ', '::').strip()
            if spouse:
                sl.append(spouse)
        if sl:
            d['spouse'] = sl
    nnames = _findBetween(cont, '<h5>Nickname</h5>', ('<br/> <br/>','<h5>'),
                          maxRes=1)
    if nnames:
        nnames = nnames[0]
        if nnames:
            nnames = [x.strip().replace(' (', '::(', 1)
                      for x in nnames.split('<br/>')]
            if nnames:
                d['nick names'] = nnames
    misc_sects = _findBetween(cont, '<h5>', '<br/>')
    misc_sects[:] = [x.split('</h5>') for x in misc_sects]
    misc_sects[:] = [x for x in misc_sects if len(x) == 2]
    for sect, data in misc_sects:
        sect = sect.lower().replace(':', '').strip()
        # Only 'mini biography' can be collected more than once.
        if sect in d and sect != 'mini biography':
            continue
        elif sect in ('spouse', 'nickname'):
            # Already handled above.
            continue
        if sect == 'salary':
            sect = 'salary history'
        elif sect == 'where are they now':
            sect = 'where now'
        elif sect == 'personal quotes':
            sect = 'quotes'
        data = data.replace('</p><p>', '::')
        data = data.replace('<br><br>', ' ') # for multi-paragraphs 'bio'
        data = data.replace('</td> <td valign="top">', '@@@@')
        data = data.replace('</td> </tr>', '::')
        data = _unHtml(data)
        data = [x.strip() for x in data.split('::')]
        data[:] = [x.replace('@@@@', '::') for x in data if x]
        if sect == 'height' and data:
            data = data[0]
        elif sect == 'birth name':
            if not data:
                # Nothing to canonicalize.
                continue
            data = canonicalName(data[0])
        elif sect == 'date of birth':
            if data:
                date, notes = date_and_notes(data[0])
                if date:
                    d['birth date'] = date
                if notes:
                    d['birth notes'] = notes
            continue
        elif sect == 'date of death':
            if data:
                date, notes = date_and_notes(data[0])
                if date:
                    d['death date'] = date
                if notes:
                    d['death notes'] = notes
            continue
        elif sect == 'mini biography':
            ndata = []
            for bio in data:
                byidx = bio.rfind('IMDb Mini Biography By')
                if byidx != -1:
                    # The text of the biography precedes the marker and
                    # the name of the author follows it (23 is the
                    # length of the marker, plus one character).
                    bioText = bio[:byidx].rstrip()
                    bioAuth = bio[byidx+23:].lstrip()
                else:
                    # No marker: keep the whole text, without cutting
                    # its first characters.
                    bioText = bio
                    bioAuth = 'Anonymous'
                ndata.append(u'%s::%s' % (bioText, bioAuth))
            data[:] = ndata
            if 'mini biography' in d:
                if ndata:
                    d['mini biography'].append(ndata[0])
                continue
        d[sect] = data
    return {'data': d}
def _search_character(self, name, results):
    """Search for characters matching 'name'.

    Return a list of (characterID, data) tuples, where data is the
    dictionary built by analyze_name.  An empty list is returned when
    the page cannot be parsed.

    name    -- the name to search for.
    results -- number of requested results, as passed to
               self._get_search_content.
    """
    cont = subXMLRefs(self._get_search_content('char', name, results))
    # Keep the searched name in 'name': it's needed by the log messages.
    page_title = _findBetween(cont, '<title>', '</title>', maxRes=1)
    res = []
    if not page_title:
        self._mobile_logger.error('no title tag searching character %s',
                                  name)
        return res
    nl = page_title[0].lower()
    if not (nl.startswith('imdb search') or
            nl.startswith('imdb character')):
        # a direct hit!
        hit_name = _unHtml(page_title[0]).replace('(Character)', '').strip()
        pid = None
        pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1)
        if pidtag:
            pid = _findBetween(pidtag[0], '/character/ch', '/', maxRes=1)
        if not (pid and hit_name):
            self._mobile_logger.error('no direct hit name/characterID for' \
                                      ' character %s', name)
            return res
        res[:] = [(str(pid[0]), analyze_name(hit_name))]
    else:
        sects = _findBetween(cont, '<b>Popular Characters</b>', '</table>',
                             maxRes=results*3)
        sects += _findBetween(cont, '<b>Characters', '</table>',
                              maxRes=results*3)
        for sect in sects:
            lis = _findBetween(sect, '<a href="/character/',
                               ['<small', '</td>', '<br'])
            for li in lis:
                # Restore the opening '<' of the tag.
                li = '<%s' % li
                pid = re_imdbID.findall(li)
                pname = _unHtml(li)
                if not (pid and pname):
                    self._mobile_logger.debug('no name/characterID' \
                                              ' parsing %s searching for' \
                                              ' character %s', li, name)
                    continue
                res.append((str(pid[0]), analyze_name(pname)))
    return res
def get_character_main(self, characterID):
"""Return the main data of a character: the result of
get_person_main, called with _parseChr=True."""
return self.get_person_main(characterID, _parseChr=True)
def get_character_biography(self, characterID):
    """Retrieve and parse the 'bio' page of a character.

    Return {'data': d}, where d may contain an 'introduction' string
    and a 'biography' list, with one 'title: text' string for every
    <h4> section found.
    """
    page = self._mretrieve(imdbURL_character_main % characterID + 'bio')
    info = {}
    intro = _findBetween(page, '<div class="display">',
                         ('<span>', '<h4>'), maxRes=1)
    if intro:
        intro_text = _unHtml(intro[0]).strip()
        if intro_text:
            info['introduction'] = intro_text
    chapters = _findBetween(page, '<div class="display">',
                            '<div class="history">')
    if chapters:
        chapters = _findBetween(chapters[0], '<h4>', ('<h4>', '</div>'))
    for chapter in chapters:
        # The end of the header separates the title from the text.
        chapter = chapter.replace('</h4>', '::').replace('\n', ' ')
        chapter = chapter.replace('<br>', '\n').replace('<br/>', '\n')
        chapter = subSGMLRefs(re_unhtmlsub('', chapter).strip())
        chapter = chapter.replace(' ::', '::').replace(':: ', '::')
        chapter = chapter.replace('::', ': ', 1)
        if chapter:
            info.setdefault('biography', []).append(chapter)
    return {'data': info}
File diff suppressed because it is too large Load Diff
-508
View File
@@ -1,508 +0,0 @@
"""
parser.sql.alchemyadapter module (imdb.parser.sql package).
This module adapts the SQLAlchemy ORM to the internal mechanism.
Copyright 2008-2010 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import re
import sys
import logging
from sqlalchemy import *
from sqlalchemy import schema
try: from sqlalchemy import exc # 0.5
except ImportError: from sqlalchemy import exceptions as exc # 0.4
_alchemy_logger = logging.getLogger('imdbpy.parser.sql.alchemy')
try:
import migrate.changeset
HAS_MC = True
except ImportError:
HAS_MC = False
_alchemy_logger.warn('Unable to import migrate.changeset: Foreign ' \
'Keys will not be created.')
from imdb._exceptions import IMDbDataAccessError
from dbschema import *
# Used to convert table and column names: matches every uppercase
# ASCII letter, captured so that it can be prefixed with an underscore.
re_upper = re.compile(r'([A-Z])')
# XXX: I'm not sure at all that this is the best method to connect
# to the database and bind that connection to every table.
# Every Table built by TableAdapter is registered in this MetaData.
metadata = MetaData()
# Maps our placeholders to SQLAlchemy's column types.
MAP_COLS = {
INTCOL: Integer,
UNICODECOL: UnicodeText,
STRINGCOL: String
}
class NotFoundError(IMDbDataAccessError):
    """Exception raised when Table.get(id) returns no value."""
def _renameTable(tname):
    """Build the name of a table, as done by SQLObject.

    Every uppercase letter is prefixed with an underscore, a single
    leading underscore is removed and the result is lowercased.
    """
    underscored = re_upper.sub(r'_\1', tname)
    if not underscored.startswith('_'):
        return underscored.lower()
    return underscored[1:].lower()
def _renameColumn(cname):
    """Build the name of a column, as done by SQLObject.

    'ID' is turned into 'Id' (so that it becomes '_id' and not '_i_d'),
    then the same rules used for table names are applied.
    """
    return _renameTable(cname.replace('ID', 'Id'))
class DNNameObj(object):
    """Used to access table.sqlmeta.columns[column].dbName (a string)."""

    def __init__(self, dbName):
        # The name of the column in the database.
        self.dbName = dbName

    def __repr__(self):
        details = (self.dbName, id(self))
        return '<DNNameObj(dbName=%s) [id=%s]>' % details
class DNNameDict(object):
    """Used to access table.sqlmeta.columns (a dictionary)."""

    def __init__(self, colMap):
        # Maps the names of the columns to their names in the database.
        self.colMap = colMap

    def __getitem__(self, key):
        # A KeyError is raised for a column missing from the map.
        dbName = self.colMap[key]
        return DNNameObj(dbName)

    def __repr__(self):
        details = (self.colMap, id(self))
        return '<DNNameDict(colMap=%s) [id=%s]>' % details
class SQLMetaAdapter(object):
    """Used to access table.sqlmeta (an object with .table, .columns and
    .idName attributes)."""

    def __init__(self, table, colMap=None):
        self.table = table
        if colMap is None:
            self.colMap = {}
        else:
            self.colMap = colMap

    def __getattr__(self, name):
        # Called only for the attributes not found in the usual way;
        # any unknown attribute is None.
        if name == 'idName':
            return self.colMap.get('id', 'id')
        if name == 'columns':
            return DNNameDict(self.colMap)
        if name == 'table':
            return getattr(self.table, name)
        return None

    def __repr__(self):
        details = (repr(self.table), repr(self.colMap), id(self))
        return '<SQLMetaAdapter(table=%s, colMap=%s) [id=%s]>' % details
class QAdapter(object):
    """Used to access table.q attribute (remapped to SQLAlchemy table.c)."""

    def __init__(self, table, colMap=None):
        self.table = table
        if colMap is None:
            self.colMap = {}
        else:
            self.colMap = colMap

    def __getattr__(self, name):
        # Translate the name of the column, then get it from table.c.
        try:
            return getattr(self.table.c, self.colMap[name])
        except KeyError:
            raise AttributeError("unable to get '%s'" % name)

    def __repr__(self):
        details = (repr(self.table), repr(self.colMap), id(self))
        return '<QAdapter(table=%s, colMap=%s) [id=%s]>' % details
class RowAdapter(object):
"""Adapter for a SQLAlchemy RowProxy object.

Reading an attribute listed in colMap returns the value of the mapped
column of the row; assigning to it runs an UPDATE on the table."""
def __init__(self, row, table, colMap=None):
self.row = row
# FIXME: it's OBSCENE that 'table' should be passed from
# TableAdapter through ResultAdapter only to land here,
# where it's used to directly update a row item.
self.table = table
if colMap is None:
colMap = {}
self.colMap = colMap
# Assigned last: until it's set, __setattr__ treats every name as a
# normal attribute.
self.colMapKeys = colMap.keys()
def __getattr__(self, name):
# Called only for the attributes not found in the usual way.
try: return getattr(self.row, self.colMap[name])
except KeyError, e: raise AttributeError, "unable to get '%s'" % name
def __setattr__(self, name, value):
# FIXME: I can't even think about how much performances suffer,
# for this horrible hack (and it's used so rarely...)
# For sure something like a "property" to map column names
# to getter/setter functions would be much better, but it's
# not possible (or at least not easy) to build them for a
# single instance.
# self.__dict__ is read directly so that __getattr__ is never
# triggered from here.
if name in self.__dict__.get('colMapKeys', ()):
# Trying to update a value in the database.
row = self.__dict__['row']
table = self.__dict__['table']
colMap = self.__dict__['colMap']
params = {colMap[name]: value}
table.update(table.c.id==row.id).execute(**params)
# XXX: minor bug: after a value is assigned with the
# 'rowAdapterInstance.colName = value' syntax, for some
# reason rowAdapterInstance.colName still returns the
# previous value (even if the database is updated).
# Fix it? I'm not even sure it's ever used.
return
# For every other attribute.
object.__setattr__(self, name, value)
def __repr__(self):
return '<RowAdapter(row=%s, table=%s, colMap=%s) [id=%s]>' % \
(repr(self.row), repr(self.table), repr(self.colMap), id(self))
class ResultAdapter(object):
    """Adapter for a SQLAlchemy ResultProxy object."""

    def __init__(self, result, table, colMap=None):
        self.result = result
        self.table = table
        if colMap is None:
            self.colMap = {}
        else:
            self.colMap = colMap

    def _adapt(self, row):
        """Wrap a single row in a RowAdapter."""
        return RowAdapter(row, self.table, colMap=self.colMap)

    def count(self):
        return len(self)

    def __len__(self):
        # FIXME: why sqlite returns -1? (that's wrooong!)
        rowcount = self.result.rowcount
        if rowcount != -1:
            return rowcount
        return 0

    def __getitem__(self, key):
        picked = list(self.result)[key]
        if isinstance(key, slice):
            # A (possible empty) list of items.
            return [self._adapt(row) for row in picked]
        # A single item.
        return self._adapt(picked)

    def __iter__(self):
        for row in self.result:
            yield self._adapt(row)

    def __repr__(self):
        details = (repr(self.result), repr(self.table),
                   repr(self.colMap), id(self))
        return '<ResultAdapter(result=%s, table=%s, colMap=%s) [id=%s]>' % \
                details
class TableAdapter(object):
"""Adapter for a SQLAlchemy Table object, to mimic a SQLObject class."""
def __init__(self, table, uri=None):
"""Initialize a TableAdapter object.

table -- the schema of the table; its name, and the name, kind,
         params and index of every one of its cols are used.
uri   -- the optional connection URI, used to select the type of
         some columns for specific database servers."""
self._imdbpySchema = table
self._imdbpyName = table.name
self.connectionURI = uri
# Maps the names of the columns in the schema to their names in the
# database.
self.colMap = {}
columns = []
for col in table.cols:
# Column's parameters.
params = {'nullable': True}
params.update(col.params)
if col.name == 'id':
params['primary_key'] = True
# 'notNone' is translated to its opposite, 'nullable'.
if 'notNone' in params:
params['nullable'] = not params['notNone']
del params['notNone']
cname = _renameColumn(col.name)
self.colMap[col.name] = cname
colClass = MAP_COLS[col.kind]
# 'length' is an argument of the type of the column, not of the
# Column itself.
colKindParams = {}
if 'length' in params:
colKindParams['length'] = params['length']
del params['length']
elif colClass is UnicodeText and col.index:
# XXX: limit length for UNICODECOLs that will have an index.
# this can result in name.name and title.title truncations!
colClass = Unicode
# Should work for most of the database servers.
length = 511
if self.connectionURI:
if self.connectionURI.startswith('mysql'):
# To stay compatible with MySQL 4.x.
length = 255
colKindParams['length'] = length
elif self._imdbpyName == 'PersonInfo' and col.name == 'info':
if self.connectionURI:
if self.connectionURI.startswith('ibm'):
# There are some entries longer than 32KB.
colClass = CLOB
# I really do hope that this space isn't wasted
# for each other shorter entry... <g>
colKindParams['length'] = 68*1024
colKind = colClass(**colKindParams)
if 'alternateID' in params:
# There's no need to handle them here.
del params['alternateID']
# Create a column.
colObj = Column(cname, colKind, **params)
columns.append(colObj)
self.tableName = _renameTable(table.name)
# Create the table.
self.table = Table(self.tableName, metadata, *columns)
# The insert statement is built once; select is kept as a callable,
# since it's called with different conditions.
self._ta_insert = self.table.insert()
self._ta_select = self.table.select
# Adapters for special attributes.
self.q = QAdapter(self.table, colMap=self.colMap)
self.sqlmeta = SQLMetaAdapter(self.table, colMap=self.colMap)
def select(self, conditions=None):
    """Execute a select on the table, restricted by the optional
    'conditions', and return the results in a ResultAdapter."""
    query = self._ta_select(conditions)
    return ResultAdapter(query.execute(), self.table, colMap=self.colMap)
def get(self, theID):
    """Get an object given its ID.

    Return the first result of a select on the 'id' column; raise
    NotFoundError if there are no results.
    """
    result = self.select(self.table.c.id == theID)
    # FIXME: isn't this a bit risky?  We can't check len(result),
    #        because sqlite returns -1...
    #        What about converting it to a list and getting the first item?
    try:
        return result[0]
    except (KeyError, IndexError):
        # ResultAdapter.__getitem__ gets the item from a list, so no
        # results means IndexError.
        raise NotFoundError('no data for ID %s' % theID)
def dropTable(self, checkfirst=True):
    """Drop the table.

    With the ibm_db driver the 'checkfirst' argument is not passed to
    drop, and a ProgrammingError raised by it is ignored.
    """
    # Guess what?  Another work-around for a ibm_db bug.
    onIbmDb = self.table.bind.engine.url.drivername.startswith('ibm_db')
    if onIbmDb:
        dropParams = {}
    else:
        dropParams = {'checkfirst': checkfirst}
    try:
        self.table.drop(**dropParams)
    except exc.ProgrammingError:
        # As above: re-raise the exception, but only if it's not ibm_db.
        if not onIbmDb:
            raise
def createTable(self, checkfirst=True):
    """Create the table, and the indexes of its alternateID columns."""
    self.table.create(checkfirst=checkfirst)
    # Other indexes will be created later, at explicit request, for
    # performances reasons.
    for col in self._imdbpySchema.cols:
        if col.name != 'id' and col.params.get('alternateID', False):
            self._createIndex(col, checkfirst=checkfirst)
def _createIndex(self, col, checkfirst=True):
"""Create an index for a given (schema) column.

If checkfirst is true, nothing is done when the table already has an
index with the same name; an OperationalError raised creating the
index is logged and ignored."""
# XXX: indexLen is ignored in SQLAlchemy, and that means that
# indexes will be over the whole 255 chars strings...
# NOTE: don't use a dot as a separator, or DB2 will do
# nasty things.
idx_name = '%s_%s' % (self.table.name, col.index or col.name)
if checkfirst:
for index in self.table.indexes:
if index.name == idx_name:
return
idx = Index(idx_name, getattr(self.table.c, self.colMap[col.name]))
# XXX: beware that exc.OperationalError can be raised, in some
# strange circumstances; that's why the index name doesn't
# follow the SQLObject convention, but includes the table name:
# sqlite, for example, expects index names to be unique at
# db-level.
try:
idx.create()
except exc.OperationalError, e:
_alchemy_logger.warn('Skipping creation of the %s.%s index: %s' %
(self.sqlmeta.table, col.name, e))
def addIndexes(self, ifNotExists=True):
    """Create all required indexes: one for every column of the schema
    with a true 'index' attribute."""
    indexed = [col for col in self._imdbpySchema.cols if col.index]
    for col in indexed:
        self._createIndex(col, checkfirst=ifNotExists)
def addForeignKeys(self, mapTables, ifNotExists=True):
    """Create a foreign key constraint for every column that declares one.

    `mapTables` maps schema table names to their adapters; it is used to
    resolve the table and column a foreign key points to.  `ifNotExists`
    is accepted but not used by this method.  Nothing is done when HAS_MC
    is false.  A constraint whose creation raises OperationalError is
    skipped.
    """
    if not HAS_MC:
        return
    # It seems that there's no reason to prevent the creation of
    # indexes for columns with FK constraints: if there's already
    # an index, the FK index is not created.
    for position, col in enumerate(self._imdbpySchema.cols):
        if not col.foreignKey:
            continue
        # Accepted notations: 'TableName' (pointing to its 'id' column)
        # and 'TableName.ColName'.
        fks = col.foreignKey.split('.', 1)
        foreignTableName = fks[0]
        foreignColName = 'id'
        if len(fks) == 2:
            foreignColName = fks[1]
        foreignColName = mapTables[foreignTableName].colMap.get(
                                            foreignColName, foreignColName)
        thisCol = self.table.columns[self.colMap.get(col.name, col.name)]
        foreignTable = mapTables[foreignTableName].table
        foreignCol = getattr(foreignTable.c, foreignColName)
        # Need to explicitly set an unique name, otherwise it will
        # explode, if two cols points to the same table; the suffix is
        # the 1-based position of the column in the schema.
        fkName = 'fk_%s_%s_%d' % (foreignTable.name, foreignColName,
                                    position + 1)
        constrain = migrate.changeset.ForeignKeyConstraint([thisCol],
                                                        [foreignCol],
                                                        name=fkName)
        try:
            constrain.create()
        except exc.OperationalError:
            continue
def __call__(self, *args, **kwds):
    """To insert a new row with the syntax: TableClass(key=value, ...)

    Keyword names are translated through self.colMap before the insert is
    executed; names without a mapping are passed unchanged.
    """
    row = dict([(self.colMap.get(key, key), value)
                for key, value in kwds.items()])
    self._ta_insert.execute(*args, **row)
def __repr__(self):
    """Return a description with the repr of the table and this object's id."""
    return '<TableAdapter(table=%r) [id=%s]>' % (self.table, id(self))
# Module-level "cache" of the TableAdapter objects built by getDBTables(),
# keyed by schema table name, to prevent
# "Table 'tableName' is already defined for this MetaData instance" errors,
# when two or more connections to the database are made.
# XXX: is this the best way to act?
TABLES_REPOSITORY = {}
def getDBTables(uri=None):
    """Return the list of TableAdapter objects, one per DB_SCHEMA table,
    to be used to access the database through the SQLAlchemy ORM.

    Adapters are built once and then served from TABLES_REPOSITORY.  The
    connection uri is optional; it is handed to newly built adapters and
    can be used to tailor the db schema to specific needs.
    """
    adapters = []
    for table in DB_SCHEMA:
        try:
            adapter = TABLES_REPOSITORY[table.name]
        except KeyError:
            adapter = TableAdapter(table, uri)
            TABLES_REPOSITORY[table.name] = adapter
        adapters.append(adapter)
    return adapters
# Functions used to emulate SQLObject's logical operators.
def AND(*params):
    """Emulate SQLObject's AND.

    Every positional argument is forwarded unchanged to and_(), whose
    result is returned.
    """
    return and_(*params)
def OR(*params):
    """Emulate SQLObject's OR.

    Every positional argument is forwarded unchanged to or_(), whose
    result is returned.
    """
    return or_(*params)
def IN(item, inList):
    """Emulate SQLObject's IN.

    When `item` is a schema.Column its in_() method is used; otherwise
    the test is spelled out as an OR of `x == item` comparisons, one for
    each element x of `inList`.
    """
    if isinstance(item, schema.Column):
        return item.in_(inList)
    comparisons = [x == item for x in inList]
    return OR(*comparisons)
def ISNULL(x):
    """Emulate SQLObject's ISNULL.

    Returns the result of comparing `x` with None through the == operator.
    """
    # XXX: Should we use null()?  Can null() be a global instance?
    # XXX: Is it safe to test None with the == operator, in this case?
    # NOTE(review): '==' (not 'is') looks deliberate: presumably `x` is an
    #               SQLAlchemy expression that overloads the operator to
    #               build an SQL condition - confirm before changing it.
    return x == None
def ISNOTNULL(x):
    """Emulate SQLObject's ISNOTNULL.

    Returns the result of comparing `x` with None through the != operator.
    """
    # NOTE(review): '!=' (not 'is not') looks deliberate: presumably `x` is
    #               an SQLAlchemy expression that overloads the operator to
    #               build an SQL condition - confirm before changing it.
    return x != None
def CONTAINSSTRING(expr, pattern):
    """Emulate SQLObject's CONTAINSSTRING.

    Calls expr.like() with `pattern` enclosed between two '%' characters
    and returns its result.
    """
    like_pattern = '%%%s%%' % pattern
    return expr.like(like_pattern)
def toUTF8(s):
    """For some strange reason, sometimes SQLObject wants utf8 strings
    instead of unicode; with SQLAlchemy we just return the unicode text.

    The argument is returned as it is, without any conversion.
    """
    return s
class _AlchemyConnection(object):
    """A proxy for the connection object, required since _ConnectionFairy
    uses __slots__.

    Attributes set on the proxy are stored on the proxy itself; any
    attribute not found there is looked up on the wrapped connection.
    """

    def __init__(self, conn):
        """Wrap `conn`, the connection object to forward lookups to."""
        self.conn = conn

    def __getattr__(self, name):
        """Forward to the wrapped connection the lookups that failed on
        the proxy; raise AttributeError if there is no wrapped connection.
        """
        # __getattr__ is called only when the normal lookup fails.  If
        # 'conn' itself is missing (e.g. an instance built without
        # calling __init__, as copy and pickle do), reading self.conn
        # below would call this method again, without end.
        if name == 'conn':
            raise AttributeError(name)
        return getattr(self.conn, name)
def setConnection(uri, tables, encoding='utf8', debug=False):
    """Connect to the database at `uri` and return a connection proxy.

    An engine is created for the uri, bound to the module's metadata and
    connected.  `encoding` is passed to the engine (and, for MySQL, also
    appended to the uri as the charset parameter); `debug` turns on the
    engine's echo option.  `tables` is accepted but not used here.

    The returned object wraps the engine connection's underlying
    connection and carries these extra attributes:
      - dbName: the db engine name (e.g. "mysql");
      - paramstyle: the style of the parameters for query() calls;
      - module: the dialect's dbapi, a module (?) with .OperationalError
        and .IntegrityError attributes;
      - getConnection: a function returning an object with a .cursor()
        method.
    """
    # FIXME: why on earth MySQL requires an additional parameter,
    #        is well beyond my understanding...
    if uri.startswith('mysql'):
        separator = '?'
        if '?' in uri:
            separator = '&'
        uri += separator
        uri += 'charset=%s' % encoding
    engine_args = {'encoding': encoding}
    if debug:
        engine_args['echo'] = True
    if uri.startswith('ibm_db'):
        # Try to work-around a possible bug of the ibm_db DB2 driver.
        engine_args['convert_unicode'] = True
    # XXX: is this the best way to connect?
    engine = create_engine(uri, **engine_args)
    metadata.bind = engine
    eng_conn = engine.connect()
    # text_factory is set only when running on Python newer than 2.5.
    if uri.startswith('sqlite') and sys.version_info[:2] > (2, 5):
        eng_conn.connection.connection.text_factory = str
    # XXX: OH MY, THAT'S A MESS!
    #      The attributes set below are the ones described in the
    #      docstring.
    connection = _AlchemyConnection(eng_conn.connection)
    dialect = eng_conn.dialect
    connection.module = dialect.dbapi
    connection.paramstyle = dialect.paramstyle
    connection.getConnection = lambda: connection.connection
    connection.dbName = engine.url.drivername
    return connection
-269
View File
@@ -1,269 +0,0 @@
/*
* cutils.c module.
*
* Miscellaneous functions to speed up the IMDbPY package.
*
* Contents:
* - pyratcliff():
* Function that implements the Ratcliff-Obershelp comparison
* amongst Python strings.
*
* - pysoundex():
* Return a soundex code string, for the given string.
*
* Copyright 2004-2009 Davide Alberani <da@erlug.linux.it>
* Released under the GPL license.
*
* NOTE: The Ratcliff-Obershelp part was heavily based on code from the
* "simil" Python module.
* The "simil" module is copyright of Luca Montecchiani <cbm64 _at_ inwind.it>
* and can be found here: http://spazioinwind.libero.it/montecchiani/
* It was released under the GPL license; original comments are left
* below.
*
*/
/*========== Ratcliff-Obershelp ==========*/
/*****************************************************************************
*
* Stolen code from :
*
* [Python-Dev] Why is soundex marked obsolete?
* by Eric S. Raymond [4]esr@thyrsus.com
* on Sun, 14 Jan 2001 14:09:01 -0500
*
*****************************************************************************/
/*****************************************************************************
*
* Ratcliff-Obershelp common-subpattern similarity.
*
* This code first appeared in a letter to the editor in Dr.
* Dobb's Journal, 11/1988. The original article on the algorithm,
* "Pattern Matching by Gestalt" by John Ratcliff, had appeared in the
* July 1988 issue (#181) but the algorithm was presented in assembly.
* The main drawback of the Ratcliff-Obershelp algorithm is the cost
* of the pairwise comparisons. It is significantly more expensive
* than stemming, Hamming distance, soundex, and the like.
*
* Running time quadratic in the data size, memory usage constant.
*
*****************************************************************************/
#include <Python.h>
/* Return values of strings_check().  DONTCOMPARE_NULL and DONTCOMPARE_SAME
 * are handed back by ratcliff() as the final similarity, while COMPARE
 * means that the full comparison has to be run. */
#define DONTCOMPARE_NULL 0.0
#define DONTCOMPARE_SAME 1.0
#define COMPARE 2.0
/* strings_check() refuses to compare two strings when the ratio between
 * the shorter and the longer length is below this value. */
#define STRING_MAXLENDIFFER 0.7
/* As of 05 Mar 2008, the longest title is ~600 chars. */
/* Used to size (terminator excluded) the local buffers of pyratcliff()
 * and pysoundex(). */
#define MXLINELEN 1023
/* The larger of a and b; each argument can be evaluated twice. */
#define MAX(a,b) ((a) > (b) ? (a) : (b))
/*
 * Preliminary check, run before the real comparison.
 *
 * Returns DONTCOMPARE_NULL if at least one string is empty or if the
 * lengths are too different (shorter/longer ratio below
 * STRING_MAXLENDIFFER), DONTCOMPARE_SAME if the strings are equal,
 * COMPARE if the comparison has to proceed.
 */
static float
strings_check(char const *s, char const *t)
{
    int len_s = strlen(s);
    int len_t = strlen(t);
    int shorter, longer;
    float ratio;    /* length of the shorter over length of the longer */

    /* empty strings? */
    if (len_s == 0 || len_t == 0)
        return (DONTCOMPARE_NULL);

    /* the same? */
    if (strcmp(s, t) == 0)
        return (DONTCOMPARE_SAME);

    /* we don't want to compare strings of too different length ;) */
    if (len_s < len_t) {
        shorter = len_s;
        longer = len_t;
    } else {
        shorter = len_t;
        longer = len_s;
    }
    ratio = (float) shorter / (float) longer;
    if (ratio < STRING_MAXLENDIFFER)
        return (DONTCOMPARE_NULL);

    /* proceed */
    return (COMPARE);
}
/*
 * Number of characters that the ranges [st1, end1) and [st2, end2) have
 * in common, according to Ratcliff-Obershelp: the length of the longest
 * common substring, plus the same count computed recursively on the
 * parts at its right and at its left.
 *
 * Returns 0 if one of the ranges is empty, or if both are one character
 * long.
 */
static int
RatcliffObershelp(char *st1, char *end1, char *st2, char *end2)
{
    char *a1, *a2;
    char *b1, *b2;
    char *s1 = st1, *s2 = st2; /* initializations are just to pacify GCC */
    int max, i;

    if (end1 <= st1 || end2 <= st2)
        return (0);
    if (end1 == st1 + 1 && end2 == st2 + 1)
        return (0);

    max = 0;
    b1 = end1;
    b2 = end2;

    for (a1 = st1; a1 < b1; a1++) {
        for (a2 = st2; a2 < b2; a2++) {
            if (*a1 == *a2) {
                /* Determine the length of the common substring.  The
                 * scan must stop at the end of both ranges: in the
                 * recursive calls the ranges are slices of longer
                 * strings, and running past their end would count again
                 * characters already matched by the caller. */
                for (i = 1;
                     a1 + i < end1 && a2 + i < end2 && a1[i] == a2[i];
                     i++)
                    continue;
                if (i > max) {
                    max = i;
                    s1 = a1;
                    s2 = a2;
                    /* A longer substring can't start after these. */
                    b1 = end1 - max;
                    b2 = end2 - max;
                }
            }
        }
    }
    if (!max)
        return (0);
    max += RatcliffObershelp(s1 + max, end1, s2 + max, end2);   /* rhs */
    max += RatcliffObershelp(st1, s1, st2, s2);                 /* lhs */
    return max;
}
/*
 * Compute the Ratcliff-Obershelp similarity of two strings.
 *
 * When strings_check() settles the matter, its value is returned as it
 * is; otherwise the result is twice the number of common characters
 * divided by the sum of the two lengths.
 */
static float
ratcliff(char *s1, char *s2)
{
    /* preliminary tests */
    float verdict = strings_check(s1, s2);
    int len1, len2, common;

    if (verdict != COMPARE)
        return (verdict);

    len1 = strlen(s1);
    len2 = strlen(s2);
    common = RatcliffObershelp(s1, s1 + len1, s2, s2 + len2);
    return 2.0 * common / (len1 + len2);
}
/* Change a NUL-terminated string to lowercase, in place. */
static void
strtolower(char *s1)
{
    /* Walk up to the terminator instead of calling strlen() at every
     * iteration, and convert through unsigned char: passing a negative
     * value (a byte >= 0x80, where char is signed) to tolower() is
     * undefined behavior. */
    for (; *s1 != '\0'; s1++)
        *s1 = (char) tolower((unsigned char) *s1);
}
/* Ratcliff-Obershelp for two python strings; returns a python float.
 *
 * The comparison is case-insensitive and considers at most the first
 * MXLINELEN bytes of each string. */
static PyObject*
pyratcliff(PyObject *self, PyObject *pArgs)
{
    char *s1 = NULL;
    char *s2 = NULL;
    PyObject *discard = NULL;
    char s1copy[MXLINELEN+1];
    char s2copy[MXLINELEN+1];

    /* The optional PyObject parameter is here to be compatible
     * with the pure python implementation, which uses a
     * difflib.SequenceMatcher object. */
    if (!PyArg_ParseTuple(pArgs, "ss|O", &s1, &s2, &discard))
        return NULL;

    /* Work on copies.  strncpy() doesn't write a terminator when the
     * source is MXLINELEN bytes or longer, so it's set explicitly. */
    strncpy(s1copy, s1, MXLINELEN);
    s1copy[MXLINELEN] = '\0';
    strncpy(s2copy, s2, MXLINELEN);
    s2copy[MXLINELEN] = '\0';

    strtolower(s1copy);
    strtolower(s2copy);

    return Py_BuildValue("f", ratcliff(s1copy, s2copy));
}
/*========== soundex ==========*/
/* Max length of the soundex code to output (an uppercase char and
* _at most_ 4 digits). */
#define SOUNDEX_LEN 5
/* Group Number Lookup Table */
/* One entry per uppercase letter, in alphabetical order (pysoundex()
 * reads it at index letter - 65): the digit of the group the letter
 * belongs to, as a character, or 0 for the letters that pysoundex()
 * leaves out of the code. */
static char soundTable[26] =
{ 0 /* A */, '1' /* B */, '2' /* C */, '3' /* D */, 0 /* E */, '1' /* F */,
'2' /* G */, 0 /* H */, 0 /* I */, '2' /* J */, '2' /* K */, '4' /* L */,
'5' /* M */, '5' /* N */, 0 /* O */, '1' /* P */, '2' /* Q */, '6' /* R */,
'2' /* S */, '3' /* T */, 0 /* U */, '1' /* V */, 0 /* W */, '2' /* X */,
0 /* Y */, '2' /* Z */};
/* Soundex code of a python string; returns a python string made of an
 * uppercase letter followed by at most SOUNDEX_LEN-1 digits, or None if
 * the string contains no ASCII letters.
 *
 * At most the first MXLINELEN ASCII letters of the string are used. */
static PyObject*
pysoundex(PyObject *self, PyObject *pArgs)
{
    int i, j, n;
    char *s = NULL;
    char word[MXLINELEN+1];
    char soundCode[SOUNDEX_LEN+1];
    int c;

    if (!PyArg_ParseTuple(pArgs, "s", &s))
        return NULL;

    j = 0;
    n = strlen(s);

    /* Convert to uppercase and exclude non-ascii chars.  The loop stops
     * when word[] is full, leaving room for the terminator; the cast
     * is needed since passing a negative char to toupper() is undefined
     * behavior. */
    for (i = 0; i < n && j < MXLINELEN; i++) {
        c = toupper((unsigned char) s[i]);
        if (c < 91 && c > 64) {
            word[j] = (char) c;
            j++;
        }
    }
    word[j] = '\0';

    n = j;
    if (n == 0) {
        /* If the string is empty, returns None. */
        return Py_BuildValue("");
    }

    soundCode[0] = word[0];

    /* Build the soundCode string. */
    j = 1;
    for (i = 1; j < SOUNDEX_LEN && i < n; i++) {
        c = soundTable[(word[i]-65)];
        /* Compact zeroes and equal consecutive digits ("12234112"->"123412") */
        if (c != 0 && c != soundCode[j-1]) {
            soundCode[j++] = (char) c;
        }
    }
    soundCode[j] = '\0';

    return Py_BuildValue("s", soundCode);
}
/* Functions exported by the module; the table ends with an all-zero
 * entry. */
static PyMethodDef cutils_methods[] = {
    {"ratcliff", pyratcliff, METH_VARARGS,
     "Ratcliff-Obershelp similarity."},
    {"soundex", pysoundex, METH_VARARGS,
     "Soundex code for strings."},
    {NULL, NULL, 0, NULL}
};
void
initcutils(void)
{
Py_InitModule("cutils", cutils_methods);
}
-461
View File
@@ -1,461 +0,0 @@
#-*- encoding: utf-8 -*-
"""
parser.sql.dbschema module (imdb.parser.sql package).
This module provides the schema used to describe the layout of the
database used by the imdb.parser.sql package; functions to create/drop
tables and indexes are also provided.
Copyright 2005-2010 Davide Alberani <da@erlug.linux.it>
2006 Giuseppe "Cowo" Corbelli <cowo --> lugbs.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import logging
_dbschema_logger = logging.getLogger('imdbpy.parser.sql.dbschema')
# Placeholders for column types.
INTCOL, UNICODECOL, STRINGCOL = 1, 2, 3

# Names of the placeholders, used in the string representation of columns.
_strMap = {
    INTCOL: 'INTCOL',
    UNICODECOL: 'UNICODECOL',
    STRINGCOL: 'STRINGCOL',
}
class DBCol(object):
    """Define column objects.

    Attributes:
      name       -- name of the column.
      kind       -- one of the column type placeholders.
      index      -- name of the index on the column, or None.
      indexLen   -- length of the index, or None.
      foreignKey -- the pointed column, or None; two notations are
                    accepted: 'TableName' and 'TableName.ColName'; in the
                    first case, 'id' is assumed as the name of the
                    pointed column.
      params     -- every other keyword argument given to the constructor.
    """

    def __init__(self, name, kind, **params):
        """Store the description of the column; the 'index', 'indexLen'
        and 'foreignKey' keywords are taken out of `params`."""
        self.name = name
        self.kind = kind
        self.index = params.pop('index', None)
        self.indexLen = params.pop('indexLen', None)
        self.foreignKey = params.pop('foreignKey', None)
        self.params = params

    def __str__(self):
        """Class representation."""
        parts = ['<DBCol %s %s' % (self.name, _strMap[self.kind])]
        if self.index:
            parts.append(' INDEX')
            if self.indexLen:
                parts.append('[:%d]' % self.indexLen)
        if self.foreignKey:
            parts.append(' FOREIGN')
        if 'default' in self.params:
            val = self.params['default']
            if val is not None:
                val = '"%s"' % val
            parts.append(' DEFAULT=%s' % val)
        # The other parameters are shown by name only.
        for param in self.params:
            if param != 'default':
                parts.append(' %s' % param.upper())
        parts.append('>')
        return ''.join(parts)

    def __repr__(self):
        """Class representation."""
        parts = ['<DBCol(name="%s", %s' % (self.name, _strMap[self.kind])]
        if self.index:
            parts.append(', index="%s"' % self.index)
        if self.indexLen:
            parts.append(', indexLen=%d' % self.indexLen)
        if self.foreignKey:
            parts.append(', foreignKey="%s"' % self.foreignKey)
        for param, val in self.params.items():
            # String values are shown within double quotes.
            if isinstance(val, (unicode, str)):
                val = u'"%s"' % val
            parts.append(', %s=%s' % (param, val))
        parts.append(')>')
        return ''.join(parts)
class DBTable(object):
    """Define table objects.

    Attributes:
      name   -- name of the table.
      cols   -- tuple of the columns given to the constructor.
      values -- default values to insert in the table, taken from the
                'values' keyword: {'column': (list, of, values, ...)}
    """

    def __init__(self, name, *cols, **kwds):
        """Store the name, the columns and the default values."""
        self.name = name
        self.cols = cols
        # Default values.
        self.values = kwds.get('values', {})

    def __str__(self):
        """Class representation."""
        nrValues = 0
        for defaults in self.values.values():
            nrValues += len(defaults)
        return '<DBTable %s (%d cols, %d values)>' % (self.name,
                                                    len(self.cols), nrValues)

    def __repr__(self):
        """Class representation."""
        # The repr of every column, without the enclosing angle brackets.
        colReprs = []
        for col in self.cols:
            colReprs.append(repr(col).rstrip('>').lstrip('<'))
        col_s = ', '.join(colReprs)
        text = '<DBTable(name="%s"' % self.name
        if col_s:
            text += ', %s' % col_s
        if self.values:
            text += ', values=%s' % self.values
        return text + ')>'
# Default values to insert in some tables: {'column': (list, of, values, ...)}
# Each dictionary is given as the 'values' argument of one of the DBTable
# objects in DB_SCHEMA, and createTables() inserts a row for each value.
# The strings are data: they must not be "corrected".

# Values for the KindType table.
kindTypeDefs = {'kind': ('movie', 'tv series', 'tv movie', 'video movie',
                        'tv mini series', 'video game', 'episode')}

# Values for the CompanyType table.
companyTypeDefs = {'kind': ('distributors', 'production companies',
                        'special effects companies', 'miscellaneous companies')}

# Values for the InfoType table.
infoTypeDefs = {'info': ('runtimes', 'color info', 'genres', 'languages',
    'certificates', 'sound mix', 'tech info', 'countries', 'taglines',
    'keywords', 'alternate versions', 'crazy credits', 'goofs',
    'soundtrack', 'quotes', 'release dates', 'trivia', 'locations',
    'mini biography', 'birth notes', 'birth date', 'height',
    'death date', 'spouse', 'other works', 'birth name',
    'salary history', 'nick names', 'books', 'agent address',
    'biographical movies', 'portrayed in', 'where now', 'trade mark',
    'interviews', 'article', 'magazine cover photo', 'pictorial',
    'death notes', 'LD disc format', 'LD year', 'LD digital sound',
    'LD official retail price', 'LD frequency response', 'LD pressing plant',
    'LD length', 'LD language', 'LD review', 'LD spaciality', 'LD release date',
    'LD production country', 'LD contrast', 'LD color rendition',
    'LD picture format', 'LD video noise', 'LD video artifacts',
    'LD release country', 'LD sharpness', 'LD dynamic range',
    'LD audio noise', 'LD color information', 'LD group genre',
    'LD quality program', 'LD close captions-teletext-ld-g',
    'LD category', 'LD analog left', 'LD certification',
    'LD audio quality', 'LD video quality', 'LD aspect ratio',
    'LD analog right', 'LD additional information',
    'LD number of chapter stops', 'LD dialogue intellegibility',
    'LD disc size', 'LD master format', 'LD subtitles',
    'LD status of availablility', 'LD quality of source',
    'LD number of sides', 'LD video standard', 'LD supplement',
    'LD original title', 'LD sound encoding', 'LD number', 'LD label',
    'LD catalog number', 'LD laserdisc title', 'screenplay-teleplay',
    'novel', 'adaption', 'book', 'production process protocol',
    'printed media reviews', 'essays', 'other literature', 'mpaa',
    'plot', 'votes distribution', 'votes', 'rating',
    'production dates', 'copyright holder', 'filming dates', 'budget',
    'weekend gross', 'gross', 'opening weekend', 'rentals',
    'admissions', 'studios', 'top 250 rank', 'bottom 10 rank')}

# Values for the CompCastType table.
compCastTypeDefs = {'kind': ('cast', 'crew', 'complete', 'complete+verified')}

# Values for the LinkType table.
linkTypeDefs = {'link': ('follows', 'followed by', 'remake of', 'remade as',
                        'references', 'referenced in', 'spoofs', 'spoofed in',
                        'features', 'featured in', 'spin off from', 'spin off',
                        'version of', 'similar to', 'edited into',
                        'edited from', 'alternate language version of',
                        'unknown link')}

# Values for the RoleType table.
roleTypeDefs = {'role': ('actor', 'actress', 'producer', 'writer',
                        'cinematographer', 'composer', 'costume designer',
                        'director', 'editor', 'miscellaneous crew',
                        'production designer', 'guest')}
# Schema of tables in our database.
# Every table is a DBTable, with a DBCol for each column.  In a DBCol,
# 'index' is the name of the index on the column and 'indexLen' its
# length; 'foreignKey' is the pointed table (optionally followed by
# '.ColName'); any other keyword ends up in the params of the column.
# XXX: Foreign keys can be used to create constraints between tables,
#      but they create indexes in the database, and this
#      means poor performance at insert-time.
DB_SCHEMA = [
    DBTable('Name',
        # namePcodeCf is the soundex of the name in the canonical format.
        # namePcodeNf is the soundex of the name in the normal format, if
        # different from namePcodeCf.
        # surnamePcode is the soundex of the surname, if different from the
        # other two values.
        # The 'id' column is simply skipped by SQLObject (it's a default);
        # the alternateID attribute here will be ignored by SQLAlchemy.
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
        DBCol('imdbIndex', UNICODECOL, length=12, default=None),
        DBCol('imdbID', INTCOL, default=None),
        DBCol('namePcodeCf', STRINGCOL, length=5, default=None,
                index='idx_pcodecf'),
        DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
                index='idx_pcodenf'),
        DBCol('surnamePcode', STRINGCOL, length=5, default=None,
                index='idx_pcode'),
        DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5')
    ),
    DBTable('CharName',
        # namePcodeNf is the soundex of the name in the normal format.
        # surnamePcode is the soundex of the surname, if different
        # from namePcodeNf.
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
        DBCol('imdbIndex', UNICODECOL, length=12, default=None),
        DBCol('imdbID', INTCOL, default=None),
        DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
                index='idx_pcodenf'),
        DBCol('surnamePcode', STRINGCOL, length=5, default=None,
                index='idx_pcode'),
        DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5')
    ),
    DBTable('CompanyName',
        # namePcodeNf is the soundex of the name in the normal format.
        # namePcodeSf is the soundex of the name plus the country code.
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6),
        DBCol('countryCode', UNICODECOL, length=255, default=None),
        DBCol('imdbID', INTCOL, default=None),
        DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
                index='idx_pcodenf'),
        DBCol('namePcodeSf', STRINGCOL, length=5, default=None,
                index='idx_pcodesf'),
        DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5')
    ),
    DBTable('KindType',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('kind', STRINGCOL, length=15, default=None, alternateID=True),
        values=kindTypeDefs
    ),
    DBTable('Title',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('title', UNICODECOL, notNone=True,
                index='idx_title', indexLen=10),
        DBCol('imdbIndex', UNICODECOL, length=12, default=None),
        DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'),
        DBCol('productionYear', INTCOL, default=None),
        DBCol('imdbID', INTCOL, default=None),
        DBCol('phoneticCode', STRINGCOL, length=5, default=None,
                index='idx_pcode'),
        DBCol('episodeOfID', INTCOL, default=None, index='idx_epof',
                foreignKey='Title'),
        DBCol('seasonNr', INTCOL, default=None),
        DBCol('episodeNr', INTCOL, default=None),
        # Maximum observed length is 44; 49 can store 5 comma-separated
        # year-year pairs.
        DBCol('seriesYears', STRINGCOL, length=49, default=None),
        DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5')
    ),
    DBTable('CompanyType',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('kind', STRINGCOL, length=32, default=None, alternateID=True),
        values=companyTypeDefs
    ),
    DBTable('AkaName',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('personID', INTCOL, notNone=True, index='idx_person',
                foreignKey='Name'),
        DBCol('name', UNICODECOL, notNone=True),
        DBCol('imdbIndex', UNICODECOL, length=12, default=None),
        DBCol('namePcodeCf', STRINGCOL, length=5, default=None,
                index='idx_pcodecf'),
        DBCol('namePcodeNf', STRINGCOL, length=5, default=None,
                index='idx_pcodenf'),
        DBCol('surnamePcode', STRINGCOL, length=5, default=None,
                index='idx_pcode'),
        DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5')
    ),
    DBTable('AkaTitle',
        # XXX: It's safer to set notNone to False, here.
        # alias for akas are stored completely in the AkaTitle table;
        # this means that episodes will set also a "tv series" alias name.
        # Reading the aka-title.list file it looks like there are
        # episode titles with aliases to different titles for both
        # the episode and the series title, while for just the series
        # there are no aliases.
        # E.g.:
        # aka title original title
        # "Series, The" (2005) {The Episode} "Other Title" (2005) {Other Title}
        # But there is no:
        # "Series, The" (2005) "Other Title" (2005)
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('movieID', INTCOL, notNone=True, index='idx_movieid',
                foreignKey='Title'),
        DBCol('title', UNICODECOL, notNone=True),
        DBCol('imdbIndex', UNICODECOL, length=12, default=None),
        DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'),
        DBCol('productionYear', INTCOL, default=None),
        DBCol('phoneticCode', STRINGCOL, length=5, default=None,
                index='idx_pcode'),
        DBCol('episodeOfID', INTCOL, default=None, index='idx_epof',
                foreignKey='AkaTitle'),
        DBCol('seasonNr', INTCOL, default=None),
        DBCol('episodeNr', INTCOL, default=None),
        DBCol('note', UNICODECOL, default=None),
        DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5')
    ),
    DBTable('RoleType',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('role', STRINGCOL, length=32, notNone=True, alternateID=True),
        values=roleTypeDefs
    ),
    DBTable('CastInfo',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('personID', INTCOL, notNone=True, index='idx_pid',
                foreignKey='Name'),
        DBCol('movieID', INTCOL, notNone=True, index='idx_mid',
                foreignKey='Title'),
        DBCol('personRoleID', INTCOL, default=None, index='idx_cid',
                foreignKey='CharName'),
        DBCol('note', UNICODECOL, default=None),
        DBCol('nrOrder', INTCOL, default=None),
        DBCol('roleID', INTCOL, notNone=True, foreignKey='RoleType')
    ),
    DBTable('CompCastType',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('kind', STRINGCOL, length=32, notNone=True, alternateID=True),
        values=compCastTypeDefs
    ),
    DBTable('CompleteCast',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('movieID', INTCOL, index='idx_mid', foreignKey='Title'),
        DBCol('subjectID', INTCOL, notNone=True, foreignKey='CompCastType'),
        DBCol('statusID', INTCOL, notNone=True, foreignKey='CompCastType')
    ),
    DBTable('InfoType',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('info', STRINGCOL, length=32, notNone=True, alternateID=True),
        values=infoTypeDefs
    ),
    DBTable('LinkType',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('link', STRINGCOL, length=32, notNone=True, alternateID=True),
        values=linkTypeDefs
    ),
    DBTable('Keyword',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        # XXX: can't use alternateID=True, because it would create
        # a UNIQUE index; unfortunately (at least with a common
        # collation like utf8_unicode_ci) MySQL will consider
        # some different keywords identical - like
        # "fiancée" and "fiancee".
        DBCol('keyword', UNICODECOL, length=255, notNone=True,
                index='idx_keyword', indexLen=5),
        DBCol('phoneticCode', STRINGCOL, length=5, default=None,
                index='idx_pcode')
    ),
    DBTable('MovieKeyword',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('movieID', INTCOL, notNone=True, index='idx_mid',
                foreignKey='Title'),
        DBCol('keywordID', INTCOL, notNone=True, index='idx_keywordid',
                foreignKey='Keyword')
    ),
    DBTable('MovieLink',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('movieID', INTCOL, notNone=True, index='idx_mid',
                foreignKey='Title'),
        DBCol('linkedMovieID', INTCOL, notNone=True, foreignKey='Title'),
        DBCol('linkTypeID', INTCOL, notNone=True, foreignKey='LinkType')
    ),
    DBTable('MovieInfo',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('movieID', INTCOL, notNone=True, index='idx_mid',
                foreignKey='Title'),
        DBCol('infoTypeID', INTCOL, notNone=True, foreignKey='InfoType'),
        DBCol('info', UNICODECOL, notNone=True),
        DBCol('note', UNICODECOL, default=None)
    ),
    # This table is identical to MovieInfo, except that both 'infoTypeID'
    # and 'info' are indexed.
    DBTable('MovieInfoIdx',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('movieID', INTCOL, notNone=True, index='idx_mid',
                foreignKey='Title'),
        DBCol('infoTypeID', INTCOL, notNone=True, index='idx_infotypeid',
                foreignKey='InfoType'),
        DBCol('info', UNICODECOL, notNone=True, index='idx_info', indexLen=10),
        DBCol('note', UNICODECOL, default=None)
    ),
    DBTable('MovieCompanies',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('movieID', INTCOL, notNone=True, index='idx_mid',
                foreignKey='Title'),
        DBCol('companyID', INTCOL, notNone=True, index='idx_cid',
                foreignKey='CompanyName'),
        DBCol('companyTypeID', INTCOL, notNone=True, foreignKey='CompanyType'),
        DBCol('note', UNICODECOL, default=None)
    ),
    DBTable('PersonInfo',
        DBCol('id', INTCOL, notNone=True, alternateID=True),
        DBCol('personID', INTCOL, notNone=True, index='idx_pid',
                foreignKey='Name'),
        DBCol('infoTypeID', INTCOL, notNone=True, foreignKey='InfoType'),
        DBCol('info', UNICODECOL, notNone=True),
        DBCol('note', UNICODECOL, default=None)
    )
]
# Functions to manage tables.
def dropTables(tables, ifExists=True):
    """Drop the tables, from the last to the first.

    `ifExists` is passed to the dropTable() method of every table.
    """
    # In reverse order (useful to avoid errors about foreign keys).
    for table in reversed(list(tables)):
        _dbschema_logger.info('dropping table %s', table._imdbpyName)
        table.dropTable(ifExists)
def createTables(tables, ifNotExists=True):
    """Create the tables and insert default values.

    `ifNotExists` is passed to the createTable() method of every table.
    The default values of a table, if any, are inserted right after its
    creation, one row per value, converted to unicode.
    """
    for table in tables:
        # Create the table.
        _dbschema_logger.info('creating table %s', table._imdbpyName)
        table.createTable(ifNotExists)
        # Insert default values, if any.
        defaults = table._imdbpySchema.values
        if not defaults:
            continue
        _dbschema_logger.info('inserting values into table %s',
                            table._imdbpyName)
        for key, values in defaults.items():
            for value in values:
                table(**{key: unicode(value)})
def createIndexes(tables, ifNotExists=True):
    """Create the indexes in the database.

    `ifNotExists` is passed to the addIndexes() method of every table.
    """
    for table in tables:
        tableName = table._imdbpyName
        _dbschema_logger.info('creating indexes for table %s', tableName)
        table.addIndexes(ifNotExists)
def createForeignKeys(tables, ifNotExists=True):
    """Create Foreign Keys.

    Every table gets, through its addForeignKeys() method, a dictionary
    mapping the schema names of all the tables to the tables themselves,
    followed by `ifNotExists`.
    """
    mapTables = dict([(table._imdbpyName, table) for table in tables])
    for table in tables:
        _dbschema_logger.info('creating foreign keys for table %s',
                            table._imdbpyName)
        table.addForeignKeys(mapTables, ifNotExists)
-203
View File
@@ -1,203 +0,0 @@
"""
parser.sql.objectadapter module (imdb.parser.sql package).
This module adapts the SQLObject ORM to the internal mechanism.
Copyright 2008-2010 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import sys
import logging
from sqlobject import *
from sqlobject.sqlbuilder import ISNULL, ISNOTNULL, AND, OR, IN, CONTAINSSTRING
from dbschema import *
_object_logger = logging.getLogger('imdbpy.parser.sql.object')
# Maps our placeholders to SQLObject's column types (the names come from
# the "from sqlobject import *" above).
MAP_COLS = {
    INTCOL: IntCol,
    UNICODECOL: UnicodeCol,
    STRINGCOL: StringCol
}

# Exception raised when Table.get(id) returns no value.
NotFoundError = SQLObjectNotFound
# class method to be added to the SQLObject class.
def addIndexes(cls, ifNotExists=True):
    """Create all required indexes.

    For every schema column that declares an index, a DatabaseIndex with
    that name is added to cls.sqlmeta, unless an index with the same name
    is already there; when the column has an indexLen, the index is
    described as {'column': ..., 'length': ...} instead of by the plain
    column name.  cls.createIndexes() is then called with `ifNotExists`;
    a dberrors.OperationalError is logged as a warning and not propagated.
    """
    for col in cls._imdbpySchema.cols:
        if col.index:
            idxName = col.index
            colToIdx = col.name
            if col.indexLen:
                colToIdx = {'column': col.name, 'length': col.indexLen}
            if idxName in [i.name for i in cls.sqlmeta.indexes]:
                # Check if the index is already present.
                continue
            idx = DatabaseIndex(colToIdx, name=idxName)
            cls.sqlmeta.addIndex(idx)
    # NOTE(review): the nesting of this try block is reconstructed;
    #               presumably it runs once, after the loop - confirm.  In
    #               that case `col` in the warning is the last column of
    #               the schema, not necessarily the one that failed.
    try:
        cls.createIndexes(ifNotExists)
    except dberrors.OperationalError, e:
        _object_logger.warn('Skipping creation of the %s.%s index: %s' %
                            (cls.sqlmeta.table, col.name, e))
addIndexes = classmethod(addIndexes)
# Global repository for "fake" tables with Foreign Keys - need to
# prevent troubles if addForeignKeys is called more than one time.
# Keys are the names of the fake tables ('myfaketable' + table name),
# values the classes returned by _buildFakeFKTable.
FAKE_TABLES_REPOSITORY = {}
def _buildFakeFKTable(cls, fakeTableName):
    """Return a "fake" table, with foreign keys where needed.

    The result is a new SQLObject subclass named `fakeTableName`, with an
    attribute for every column of the schema of `cls` ('id' excluded):
    a ForeignKey for the columns that declare one, the usual column
    otherwise.
    """
    attrs = {}
    for col in cls._imdbpySchema.cols:
        if col.name == 'id':
            continue
        if col.foreignKey:
            # XXX: Foreign Keys pointing to TableName.ColName not yet
            #      supported: only the table name is used.
            foreignTableName = col.foreignKey.split('.', 1)[0]
            # The attribute is named after the column, minus a trailing
            # 'ID'.
            attrName = col.name
            if attrName.endswith('ID'):
                attrName = attrName[:-2]
            # Create a Foreign Key column, with the correct references.
            attrs[attrName] = ForeignKey(foreignTableName, name=attrName,
                                        default=None)
        else:
            # A non-foreign key column - add it as usual.
            attrs[col.name] = MAP_COLS[col.kind](**col.params)
    # Build a _NEW_ SQLObject subclass, with foreign keys, if needed.
    return type(fakeTableName, (SQLObject,), attrs)
def addForeignKeys(cls, mapTables, ifNotExists=True):
    """Create all required foreign keys.

    The SQL of the constraints is obtained from a "fake" table, built
    with foreign keys and connected to the connection of `cls`; the name
    of the fake table is then replaced with the real one, and the query
    is run.  `mapTables` and `ifNotExists` are accepted but not used.
    """
    # Do not even try, if there are no FK, in this table.
    fkCols = [col for col in cls._imdbpySchema.cols if col.foreignKey]
    if not fkCols:
        return
    fakeTableName = 'myfaketable%s' % cls.sqlmeta.table
    try:
        newcls = FAKE_TABLES_REPOSITORY[fakeTableName]
    except KeyError:
        newcls = _buildFakeFKTable(cls, fakeTableName)
        FAKE_TABLES_REPOSITORY[fakeTableName] = newcls
    # Connect the class with foreign keys.
    newcls.setConnection(cls._connection)
    for col in fkCols:
        if col.name == 'id':
            continue
        # Get the SQL that _WOULD BE_ run, if we had to create
        # this "fake" table.
        fkColumn = newcls.sqlmeta.columns[col.name]
        fkQuery = newcls._connection.createReferenceConstraint(newcls,
                                                                fkColumn)
        if not fkQuery:
            # Probably the db doesn't support foreign keys (SQLite).
            continue
        # Remove "myfaketable" to get references to _real_ tables,
        # then execute the query.
        newcls._connection.query(fkQuery.replace('myfaketable', ''))
    # Disconnect it.
    newcls._connection.close()
addForeignKeys = classmethod(addForeignKeys)
# Module-level "cache" for SQLObject classes, to prevent
# "class TheClass is already in the registry" errors, when
# two or more connections to the database are made.
# Keys are the names of the schema tables, values the classes built
# by getDBTables().
# XXX: is this the best way to act?
TABLES_REPOSITORY = {}
def getDBTables(uri=None):
"""Return a list of classes to be used to access the database
through the SQLObject ORM. The connection uri is optional, and
can be used to tailor the db schema to specific needs."""
DB_TABLES = []
for table in DB_SCHEMA:
if table.name in TABLES_REPOSITORY:
DB_TABLES.append(TABLES_REPOSITORY[table.name])
continue
attrs = {'_imdbpyName': table.name, '_imdbpySchema': table,
'addIndexes': addIndexes, 'addForeignKeys': addForeignKeys}
for col in table.cols:
if col.name == 'id':
continue
attrs[col.name] = MAP_COLS[col.kind](**col.params)
# Create a subclass of SQLObject.
# XXX: use a metaclass? I can't see any advantage.
cls = type(table.name, (SQLObject,), attrs)
DB_TABLES.append(cls)
TABLES_REPOSITORY[table.name] = cls
return DB_TABLES
def toUTF8(s):
    """Return s encoded as utf8.

    It's needed since, sometimes, SQLObject wants utf8 strings
    instead of unicode."""
    utf8Value = s.encode('utf_8')
    return utf8Value
def setConnection(uri, tables, encoding='utf8', debug=False):
    """Set connection for every table.

    uri      -- the connection URI, passed to connectionForURI.
    tables   -- the table classes to be bound to the new connection.
    encoding -- charset given to the driver (used only for MySQL URIs).
    debug    -- value stored in the 'debug' attribute of the connection.

    Return the connection object."""
    kw = {}
    # FIXME: it's absolutely unclear what we should do to correctly
    #        support unicode in MySQL; with some versions of SQLObject,
    #        it seems that setting use_unicode=1 is the _wrong_ thing to do.
    _uriLower = uri.lower()
    if _uriLower.startswith('mysql'):
        kw['use_unicode'] = 1
        #kw['sqlobject_encoding'] = encoding
        kw['charset'] = encoding
    conn = connectionForURI(uri, **kw)
    conn.debug = debug
    # The scheme is checked in a case-insensitive way, consistently
    # with the MySQL test above.
    if _uriLower.startswith('sqlite'):
        # Only for Python versions newer than 2.5.
        if sys.version_info[:2] > (2, 5):
            conn.connection.connection.text_factory = str
    for table in tables:
        table.setConnection(conn)
        #table.sqlmeta.cacheValues = False
        # FIXME: is it safe to set table._cacheValue to False?  Looks like
        #        we can't retrieve correct values after an update (I think
        #        it's never needed, but...)  Anyway, these are set to False
        #        for performance reason at insert time (see imdbpy2sql.py).
        table._cacheValue = False
    # Required by imdbpy2sql.py.
    conn.paramstyle = conn.module.paramstyle
    return conn
-1545
View File
File diff suppressed because it is too large Load Diff