Re-use imdb page parser

2014-05-19 19:25:16 +02:00
parent 577bf09859
commit 08f55314d5
1 changed files with 55 additions and 78 deletions
--- a/couchpotato/core/media/movie/providers/automation/imdb.py
+++ b/couchpotato/core/media/movie/providers/automation/imdb.py
@@ -28,6 +28,34 @@ class IMDBBase(Automation, RSS):
    def getInfo(self, imdb_id):
        return fireEvent('movie.info', identifier = imdb_id, extended = False, merge = True)

+    def getFromURL(self, url):
+        """Fetch `url` and return the IMDB ids found in its list markup."""
+        log.debug('Getting IMDBs from: %s', url)
+        html = self.getHTMLData(url)
+        if not html:
+            return []  # str(None) below is truthy and would defeat the guard
+
+        try:
+            # Fast path: cut the compact list out of the raw markup.
+            split = splitString(html, split_on = "<div class=\"list compact\">")[1]
+            html = splitString(split, split_on = "<div class=\"pages\">")[0]
+        except Exception:
+            try:
+                # Fallback: parse #main and use the first known list container.
+                split = splitString(html, split_on = "<div id=\"main\">")
+                html = BeautifulSoup(split[1])
+                for class_name in ['list compact', 'lister', 'list detail sub-list']:
+                    found = html.find('div', attrs = {'class': class_name})
+                    if found:
+                        html = ''.join([str(node) for node in found.contents])
+                        break
+            except Exception:
+                log.error('Failed parsing IMDB page "%s": %s', (url, traceback.format_exc()))
+
+        # html may still be a BeautifulSoup object if no container matched.
+        html = str(html)
+        return getImdb(html, multiple = True) if html else []
+

 class IMDBWatchlist(IMDBBase):

@@ -65,22 +93,7 @@ class IMDBWatchlist(IMDBBase):
                try:

                    w_url = '%s&start=%s' % (watchlist_url, start)
-                    log.debug('Started IMDB watchlists: %s', w_url)
-                    html = self.getHTMLData(w_url)
-
-                    try:
-                        split = splitString(html, split_on="<div id=\"main\">")
-                        html2 = BeautifulSoup(split[1])
-                        html = html2.find('div', attrs = {'class': 'list compact'}).contents
-                        html = ''.join([str(x) for x in html])
-                    except:
-                        try:
-                            split = splitString(html, split_on="<div class=\"list compact\">")[1]
-                            html = splitString(split, split_on="<div class=\"pages\">")[0]
-                        except:
-                            pass
-
-                    imdbs = getImdb(html, multiple = True) if html else []
+                    imdbs = self.getFromURL(w_url)

                    for imdb in imdbs:
                        if imdb not in movies:
@@ -115,12 +128,12 @@ class IMDBAutomation(IMDBBase):
        'boxoffice': {
            'order': 2,
            'name': 'IMDB - Box Office',
-            'url': 'http://www.imdb.com/chart/',
+            'url': 'http://www.imdb.com/boxoffice/',
        },
        'rentals': {
            'order': 3,
            'name': 'IMDB - Top DVD rentals',
-            'url': 'http://m.imdb.com/boxoffice_json',
+            'url': 'http://www.imdb.com/boxoffice/rentals',
            'type': 'json',
        },
        'top250': {
@@ -130,8 +143,6 @@ class IMDBAutomation(IMDBBase):
        },
    }

-    first_table = ['boxoffice']
-
    def getIMDBids(self):

        movies = []
@@ -141,36 +152,19 @@ class IMDBAutomation(IMDBBase):
            url = chart.get('url')

            if self.conf('automation_charts_%s' % name):
-                data = self.getHTMLData(url)
+                imdb_ids = self.getFromURL(url)

-                if data:
-                    try:
-                        html = BeautifulSoup(data)
+                try:
+                    for imdb_id in imdb_ids:
+                        info = self.getInfo(imdb_id)
+                        if info and self.isMinimalMovie(info):
+                            movies.append(imdb_id)

-                        if chart.get('type', 'html') == 'html':
-                            result_div = html.find('div', attrs = {'id': 'main'})
+                        if self.shuttingDown():
+                            break

-                            try:
-                                if url in self.first_table:
-                                    table = result_div.find('table')
-                                    result_div = table if table else result_div
-                            except:
-                                pass
-
-                            imdb_ids = getImdb(str(result_div), multiple = True)
-                        else:
-                            imdb_ids = getImdb(str(data), multiple = True)
-
-                        for imdb_id in imdb_ids:
-                            info = self.getInfo(imdb_id)
-                            if info and self.isMinimalMovie(info):
-                                movies.append(imdb_id)
-
-                            if self.shuttingDown():
-                                break
-
-                    except:
-                        log.error('Failed loading IMDB chart results from %s: %s', (url, traceback.format_exc()))
+                except:
+                    log.error('Failed loading IMDB chart results from %s: %s', (url, traceback.format_exc()))

        return movies

@@ -188,42 +182,25 @@ class IMDBAutomation(IMDBBase):

                chart['list'] = []

-                data = self.getHTMLData(url)
-                if data:
-                    html = BeautifulSoup(data)
+                imdb_ids = self.getFromURL(url)

-                    try:
+                try:
+                    for imdb_id in imdb_ids[0:max_items]:

-                        if chart.get('type', 'html') == 'html':
-                            result_div = html.find('div', attrs = {'id': 'main'})
+                        is_movie = fireEvent('movie.is_movie', identifier = imdb_id, single = True)
+                        if not is_movie:
+                            continue

-                            try:
-                                if url in self.first_table:
-                                    table = result_div.find('table')
-                                    result_div = table if table else result_div
-                            except:
-                                pass
+                        info = self.getInfo(imdb_id)
+                        chart['list'].append(info)

-                            imdb_ids = getImdb(str(result_div), multiple = True)
-                        else:
-                            imdb_ids = getImdb(str(data), multiple = True)
+                        if self.shuttingDown():
+                            break
+                except:
+                    log.error('Failed loading IMDB chart results from %s: %s', (url, traceback.format_exc()))

-                        for imdb_id in imdb_ids[0:max_items]:
-
-                            is_movie = fireEvent('movie.is_movie', identifier = imdb_id, single = True)
-                            if not is_movie:
-                                continue
-
-                            info = self.getInfo(imdb_id)
-                            chart['list'].append(info)
-
-                            if self.shuttingDown():
-                                break
-                    except:
-                        log.error('Failed loading IMDB chart results from %s: %s', (url, traceback.format_exc()))
-
-                    if chart['list']:
-                        movie_lists.append(chart)
+                if chart['list']:
+                    movie_lists.append(chart)


        return movie_lists