Re-use imdb page parser

This commit is contained in:
Ruud
2014-05-19 19:25:16 +02:00
parent 577bf09859
commit 08f55314d5

View File

@@ -28,6 +28,34 @@ class IMDBBase(Automation, RSS):
def getInfo(self, imdb_id):
    """Look up movie info for an IMDB id through the 'movie.info' event.

    Fires 'movie.info' with the given identifier, ``extended = False`` and
    ``merge = True``, and returns whatever the event dispatcher hands back.
    """
    info = fireEvent('movie.info', identifier = imdb_id, extended = False, merge = True)
    return info
def getFromURL(self, url):
    """Fetch an IMDB list page and return the IMDB ids found in it.

    The page is narrowed down to its list markup before ids are extracted:

    1. First try a plain string split on the ``list compact`` container,
       cutting off everything from the ``pages`` container onwards.
    2. If that fails, parse everything after ``<div id="main">`` with
       BeautifulSoup and use the contents of the first matching list
       container (``list compact``, ``lister`` or ``list detail sub-list``).
       When none of them is present the whole parsed ``main`` markup is used.

    :param url: address of the IMDB page to fetch.
    :return: result of ``getImdb(..., multiple = True)`` on the narrowed
        markup, or an empty list when there is nothing to parse.
    """
    log.debug('Getting IMDBs from: %s', url)
    html = self.getHTMLData(url)

    # Nothing was fetched: skip parsing so an empty response does not get
    # reported as a parse failure with a traceback.
    # NOTE(review): assumes getHTMLData returns a falsy value on failure -- confirm.
    if not html:
        return []

    try:
        split = splitString(html, split_on = "<div class=\"list compact\">")[1]
        html = splitString(split, split_on = "<div class=\"pages\">")[0]
    except Exception:
        # Marker not found (or split failed): fall back to a real HTML parse.
        try:
            split = splitString(html, split_on = "<div id=\"main\">")

            soup = BeautifulSoup(split[1])
            # Default to the whole parsed markup if no list container matches.
            html = soup
            for class_name in ['list compact', 'lister', 'list detail sub-list']:
                container = soup.find('div', attrs = {
                    'class': class_name
                })

                if container:
                    html = ''.join([str(node) for node in container.contents])
                    break
        except Exception:
            log.error('Failed parsing IMDB page "%s": %s', (url, traceback.format_exc()))

    # html may still be a BeautifulSoup object here; getImdb is given a string.
    html = str(html)
    imdbs = getImdb(html, multiple = True) if html else []

    return imdbs
class IMDBWatchlist(IMDBBase):
@@ -65,22 +93,7 @@ class IMDBWatchlist(IMDBBase):
try:
w_url = '%s&start=%s' % (watchlist_url, start)
log.debug('Started IMDB watchlists: %s', w_url)
html = self.getHTMLData(w_url)
try:
split = splitString(html, split_on="<div id=\"main\">")
html2 = BeautifulSoup(split[1])
html = html2.find('div', attrs = {'class': 'list compact'}).contents
html = ''.join([str(x) for x in html])
except:
try:
split = splitString(html, split_on="<div class=\"list compact\">")[1]
html = splitString(split, split_on="<div class=\"pages\">")[0]
except:
pass
imdbs = getImdb(html, multiple = True) if html else []
imdbs = self.getFromURL(w_url)
for imdb in imdbs:
if imdb not in movies:
@@ -115,12 +128,12 @@ class IMDBAutomation(IMDBBase):
'boxoffice': {
'order': 2,
'name': 'IMDB - Box Office',
'url': 'http://www.imdb.com/chart/',
'url': 'http://www.imdb.com/boxoffice/',
},
'rentals': {
'order': 3,
'name': 'IMDB - Top DVD rentals',
'url': 'http://m.imdb.com/boxoffice_json',
'url': 'http://www.imdb.com/boxoffice/rentals',
'type': 'json',
},
'top250': {
@@ -130,8 +143,6 @@ class IMDBAutomation(IMDBBase):
},
}
first_table = ['boxoffice']
def getIMDBids(self):
movies = []
@@ -141,36 +152,19 @@ class IMDBAutomation(IMDBBase):
url = chart.get('url')
if self.conf('automation_charts_%s' % name):
data = self.getHTMLData(url)
imdb_ids = self.getFromURL(url)
if data:
try:
html = BeautifulSoup(data)
try:
for imdb_id in imdb_ids:
info = self.getInfo(imdb_id)
if info and self.isMinimalMovie(info):
movies.append(imdb_id)
if chart.get('type', 'html') == 'html':
result_div = html.find('div', attrs = {'id': 'main'})
if self.shuttingDown():
break
try:
if url in self.first_table:
table = result_div.find('table')
result_div = table if table else result_div
except:
pass
imdb_ids = getImdb(str(result_div), multiple = True)
else:
imdb_ids = getImdb(str(data), multiple = True)
for imdb_id in imdb_ids:
info = self.getInfo(imdb_id)
if info and self.isMinimalMovie(info):
movies.append(imdb_id)
if self.shuttingDown():
break
except:
log.error('Failed loading IMDB chart results from %s: %s', (url, traceback.format_exc()))
except:
log.error('Failed loading IMDB chart results from %s: %s', (url, traceback.format_exc()))
return movies
@@ -188,42 +182,25 @@ class IMDBAutomation(IMDBBase):
chart['list'] = []
data = self.getHTMLData(url)
if data:
html = BeautifulSoup(data)
imdb_ids = self.getFromURL(url)
try:
try:
for imdb_id in imdb_ids[0:max_items]:
if chart.get('type', 'html') == 'html':
result_div = html.find('div', attrs = {'id': 'main'})
is_movie = fireEvent('movie.is_movie', identifier = imdb_id, single = True)
if not is_movie:
continue
try:
if url in self.first_table:
table = result_div.find('table')
result_div = table if table else result_div
except:
pass
info = self.getInfo(imdb_id)
chart['list'].append(info)
imdb_ids = getImdb(str(result_div), multiple = True)
else:
imdb_ids = getImdb(str(data), multiple = True)
if self.shuttingDown():
break
except:
log.error('Failed loading IMDB chart results from %s: %s', (url, traceback.format_exc()))
for imdb_id in imdb_ids[0:max_items]:
is_movie = fireEvent('movie.is_movie', identifier = imdb_id, single = True)
if not is_movie:
continue
info = self.getInfo(imdb_id)
chart['list'].append(info)
if self.shuttingDown():
break
except:
log.error('Failed loading IMDB chart results from %s: %s', (url, traceback.format_exc()))
if chart['list']:
movie_lists.append(chart)
if chart['list']:
movie_lists.append(chart)
return movie_lists