Update GuessIt

2013-09-17 21:04:15 +02:00
parent 5f5f17112a
commit ad01a3da4d
11 changed files with 179 additions and 50 deletions
--- a/libs/guessit/init.py
+++ b/libs/guessit/init.py
@@ -20,7 +20,7 @@

 from __future__ import unicode_literals

-__version__ = '0.6-dev'
+__version__ = '0.7-dev'
 __all__ = ['Guess', 'Language',
           'guess_file_info', 'guess_video_info',
           'guess_movie_info', 'guess_episode_info']
@@ -91,7 +91,28 @@ log.addHandler(h)


 def _guess_filename(filename, filetype):
+    def find_nodes(tree, props):
+        """Yields all nodes containing any of the given props."""
+        if isinstance(props, base_text_type):
+            props = [props]
+        for node in tree.nodes():
+            if any(prop in node.guess for prop in props):
+                yield node
+
+    def warning(title):
+        log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string()))
+        return m
+
    mtree = IterativeMatcher(filename, filetype=filetype)
+
+    # if there are multiple possible years found, we assume the first one is
+    # part of the title, reparse the tree taking this into account
+    years = set(n.value for n in find_nodes(mtree.match_tree, 'year'))
+    if len(years) >= 2:
+        mtree = IterativeMatcher(filename, filetype=filetype,
+                                 opts=['skip_first_year'])
+
+
    m = mtree.matched()

    if 'language' not in m and 'subtitleLanguage' not in m:
@@ -102,20 +123,10 @@ def _guess_filename(filename, filetype):
                              opts=['nolanguage', 'nocountry'])
    m2 = mtree2.matched()

-    def find_nodes(tree, props):
-        """Yields all nodes containing any of the given props."""
-        if isinstance(props, base_text_type):
-            props = [props]
-        for node in tree.nodes():
-            if any(prop in node.guess for prop in props):
-                yield node

-
-    def warning(title):
-        log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string()))
+    if m.get('title') is None:
        return m

-
    if m.get('title') != m2.get('title'):
        title = next(find_nodes(mtree.match_tree, 'title'))
        title2 = next(find_nodes(mtree2.match_tree, 'title'))
--- a/libs/guessit/fileutils.py
+++ b/libs/guessit/fileutils.py
@@ -77,12 +77,12 @@ def file_in_same_dir(ref_file, desired_file):

 def load_file_in_same_dir(ref_file, filename):
    """Load a given file. Works even when the file is contained inside a zip."""
-    path = split_path(ref_file)[:-1] + [str(filename)]
+    path = split_path(ref_file)[:-1] + [filename]

    for i, p in enumerate(path):
-        if p[-4:] == '.zip':
+        if p.endswith('.zip'):
            zfilename = os.path.join(*path[:i + 1])
            zfile = zipfile.ZipFile(zfilename)
            return zfile.read('/'.join(path[i + 1:]))

-    return u(io.open(os.path.join(*path), encoding = 'utf-8').read())
+    return u(io.open(os.path.join(*path), encoding='utf-8').read())
--- a/libs/guessit/guess.py
+++ b/libs/guessit/guess.py
@@ -295,7 +295,7 @@ def merge_all(guesses, append=None):
        # then merge the remaining ones
        dups = set(result) & set(g)
        if dups:
-            log.warning('duplicate properties %s in merged result...' % dups)
+            log.warning('duplicate properties %s in merged result...' % [ (result[p], g[p]) for p in dups] )

        result.update_highest_confidence(g)

--- a/libs/guessit/language.py
+++ b/libs/guessit/language.py
@@ -326,7 +326,7 @@ def search_language(string, lang_filter=None):
        'la', 'el', 'del', 'por', 'mar',
        # other
        'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii',
-        'vi', 'ben', 'da'
+        'vi', 'ben', 'da', 'lt'
        ])
    sep = r'[](){} \._-+'

--- a/libs/guessit/matcher.py
+++ b/libs/guessit/matcher.py
@@ -128,12 +128,14 @@ class IterativeMatcher(object):
            apply_transfo(name)

        # more guessers for both movies and episodes
-        for name in ['guess_bonus_features', 'guess_year']:
-            apply_transfo(name)
+        apply_transfo('guess_bonus_features')
+        apply_transfo('guess_year', skip_first_year=('skip_first_year' in opts))

        if 'nocountry' not in opts:
            apply_transfo('guess_country')

+        apply_transfo('guess_idnumber')
+

        # split into '-' separated subgroups (with required separator chars
        # around the dash)
--- a/libs/guessit/matchtree.py
+++ b/libs/guessit/matchtree.py
@@ -275,7 +275,7 @@ class MatchTree(BaseMatchTree):
        for string_part in ('title', 'series', 'container', 'format',
                            'releaseGroup', 'website', 'audioCodec',
                            'videoCodec', 'screenSize', 'episodeFormat',
-                            'audioChannels'):
+                            'audioChannels', 'idNumber'):
            merge_similar_guesses(parts, string_part, choose_string)

        # 2- merge the rest, potentially discarding information not properly
--- a/libs/guessit/patterns.py
+++ b/libs/guessit/patterns.py
@@ -43,13 +43,13 @@ episode_rexps = [ # ... Season 2 ...
                  (r'saison (?P<season>[0-9]+)', 1.0, (0, 0)),

                  # ... s02e13 ...
-                  (r'[Ss](?P<season>[0-9]{1,2}).?(?P<episodeNumber>(?:[Ee-][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)),
+                  (r'[Ss](?P<season>[0-9]{1,3})[^0-9]?(?P<episodeNumber>(?:-?[eE-][0-9]{1,3})+)[^0-9]', 1.0, (0, -1)),

-                  # ... s03-x02 ...
-                  (r'[Ss](?P<season>[0-9]{1,2}).?(?P<bonusNumber>(?:[Xx][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)),
+                  # ... s03-x02 ... # FIXME: redundant? remove it?
+                  #(r'[Ss](?P<season>[0-9]{1,3})[^0-9]?(?P<bonusNumber>(?:-?[xX-][0-9]{1,3})+)[^0-9]', 1.0, (0, -1)),

                  # ... 2x13 ...
-                  (r'[^0-9](?P<season>[0-9]{1,2}).?(?P<episodeNumber>(?:[xX][0-9]{1,2})+)[^0-9]', 0.8, (1, -1)),
+                  (r'[^0-9](?P<season>[0-9]{1,2})[^0-9]?(?P<episodeNumber>(?:-?[xX][0-9]{1,3})+)[^0-9]', 1.0, (1, -1)),

                  # ... s02 ...
                  #(sep + r's(?P<season>[0-9]{1,2})' + sep, 0.6, (1, -1)),
@@ -122,20 +122,25 @@ prop_multi = { 'format': { 'DVD': [ 'DVD', 'DVD-Rip', 'VIDEO-TS', 'DVDivX' ],
                           'VHS': [ 'VHS' ],
                           'WEB-DL': [ 'WEB-DL' ] },

-               'screenSize': { '480p': [ '480p?' ],
-                               '720p': [ '720p?' ],
-                               '1080p': [ '1080p?' ] },
+               'screenSize': { '480p': [ '480[pi]?' ],
+                               '720p': [ '720[pi]?' ],
+                               '1080p': [ '1080[pi]?' ] },

               'videoCodec': { 'XviD': [ 'Xvid' ],
                               'DivX': [ 'DVDivX', 'DivX' ],
                               'h264': [ '[hx]-264' ],
-                               'Rv10': [ 'Rv10' ] },
+                               'Rv10': [ 'Rv10' ],
+                               'Mpeg2': [ 'Mpeg2' ] },
+
+               # has nothing to do here (or on filenames for that matter), but some
+               # releases use it and it helps to identify release groups, so we adapt
+               'videoApi': {  'DXVA': [ 'DXVA' ] },

               'audioCodec': { 'AC3': [ 'AC3' ],
                               'DTS': [ 'DTS' ],
                               'AAC': [ 'He-AAC', 'AAC-He', 'AAC' ] },

-               'audioChannels': { '5.1': [ r'5\.1', 'DD5\.1', '5ch' ] },
+               'audioChannels': { '5.1': [ r'5\.1', 'DD5[\._ ]1', '5ch' ] },

               'episodeFormat': { 'Minisode': [ 'Minisodes?' ] }

@@ -143,14 +148,21 @@ prop_multi = { 'format': { 'DVD': [ 'DVD', 'DVD-Rip', 'VIDEO-TS', 'DVDivX' ],

 # prop_single dict of { property_name: [ canonical_form ] }
 prop_single = { 'releaseGroup': [ 'ESiR', 'WAF', 'SEPTiC', r'\[XCT\]', 'iNT', 'PUKKA',
-                                  'CHD', 'ViTE', 'TLF', 'DEiTY', 'FLAiTE',
-                                  'MDX', 'GM4F', 'DVL', 'SVD', 'iLUMiNADOS', 'FiNaLe',
-                                  'UnSeeN', 'aXXo', 'KLAXXON', 'NoTV', 'ZeaL', 'LOL',
-                                  'SiNNERS', 'DiRTY', 'REWARD', 'ECI', 'KiNGS', 'CLUE',
-                                  'CtrlHD', 'POD', 'WiKi', 'DIMENSION', 'IMMERSE', 'FQM',
-                                  '2HD', 'REPTiLE', 'CTU', 'HALCYON', 'EbP', 'SiTV',
-                                  'SAiNTS', 'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV',
-                                  'TLA', 'NTB', 'ASAP', 'MOMENTUM', 'FoV', 'D-Z0N3' ],
+                                  'CHD', 'ViTE', 'TLF', 'FLAiTE',
+                                  'MDX', 'GM4F', 'DVL', 'SVD', 'iLUMiNADOS',
+                                  'aXXo', 'KLAXXON', 'NoTV', 'ZeaL', 'LOL',
+                                  'CtrlHD', 'POD', 'WiKi','IMMERSE', 'FQM',
+                                  '2HD',  'CTU', 'HALCYON', 'EbP', 'SiTV',
+                                  'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV',
+                                  'TLA', 'NTB', 'ASAP', 'MOMENTUM', 'FoV', 'D-Z0N3',
+                                  'TrollHD', 'ECI'
+                                  ],
+
+                # potentially confusing release group names (they are words)
+                'weakReleaseGroup': [ 'DEiTY', 'FiNaLe', 'UnSeeN', 'KiNGS', 'CLUE', 'DIMENSION',
+                                      'SAiNTS', 'ARROW', 'EuReKA', 'SiNNERS', 'DiRTY', 'REWARD',
+                                      'REPTiLE',
+                                      ],

                'other': [ 'PROPER', 'REPACK', 'LIMITED', 'DualAudio', 'Audiofixed', 'R5',
                           'complete', 'classic', # not so sure about these ones, could appear in a title
@@ -179,6 +191,10 @@ properties_rexps.update(dict((type, dict((canonical_form, [ _to_rexp(canonical_f
 def find_properties(string):
    result = []
    for property_name, props in properties_rexps.items():
+        # FIXME: this should be done in a more flexible way...
+        if property_name in ['weakReleaseGroup']:
+            continue
+
        for canonical_form, rexps in props.items():
            for value_rexp in rexps:
                match = value_rexp.search(string)
--- a/libs/guessit/transfo/guess_episodes_rexps.py
+++ b/libs/guessit/transfo/guess_episodes_rexps.py
@@ -28,7 +28,13 @@ import logging
 log = logging.getLogger(__name__)

 def number_list(s):
-    return list(re.sub('[^0-9]+', ' ', s).split())
+    """Return the ints found in s; two ascending numbers are expanded as an inclusive interval."""
+    numbers = [ int(n) for n in re.sub('[^0-9]+', ' ', s).split() ]
+    if len(numbers) == 2 and numbers[0] <= numbers[1]:
+        # episode interval: a real list (not a lazy range) keeps the return type uniform;
+        # a descending pair is returned untouched rather than as an empty sequence
+        return list(range(numbers[0], numbers[1]+1))
+    return numbers

 def guess_episodes_rexps(string):
    for rexp, confidence, span_adjust in episode_rexps:
@@ -38,23 +44,23 @@ def guess_episodes_rexps(string):
            span = (match.start() + span_adjust[0],
                    match.end() + span_adjust[1])

-            # episodes which have a season > 25 are most likely errors
+            # episodes which have a season > 30 are most likely errors
            # (Simpsons is at 24!)
-            if int(guess.get('season', 0)) > 25:
+            if int(guess.get('season', 0)) > 30:
                continue

            # decide whether we have only a single episode number or an
            # episode list
            if guess.get('episodeNumber'):
                eplist = number_list(guess['episodeNumber'])
-                guess.set('episodeNumber', int(eplist[0]), confidence=confidence)
+                guess.set('episodeNumber', eplist[0], confidence=confidence)

                if len(eplist) > 1:
-                    guess.set('episodeList', list(map(int, eplist)), confidence=confidence)
+                    guess.set('episodeList', eplist, confidence=confidence)

            if guess.get('bonusNumber'):
                eplist = number_list(guess['bonusNumber'])
-                guess.set('bonusNumber', int(eplist[0]), confidence=confidence)
+                guess.set('bonusNumber', eplist[0], confidence=confidence)

            return guess, span

--- a/libs/guessit/transfo/guess_idnumber.py
+++ b/libs/guessit/transfo/guess_idnumber.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# GuessIt - A library for guessing information from filenames
+# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
+#
+# GuessIt is free software; you can redistribute it and/or modify it under
+# the terms of the Lesser GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# GuessIt is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# Lesser GNU General Public License for more details.
+#
+# You should have received a copy of the Lesser GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+from __future__ import unicode_literals
+from guessit.transfo import SingleNodeGuesser
+from guessit.patterns import find_properties
+import re
+import logging
+
+log = logging.getLogger(__name__)
+
+
+def guess_properties(string):  # NOTE(review): not called anywhere in this new file — confirm it is needed here
+    try:
+        prop, value, pos, end = find_properties(string)[0]  # only the first entry is used
+        return { prop: value }, (pos, end)
+    except IndexError:  # presumably an empty find_properties() result — reported as "no guess"
+        return None, None
+
+_idnum = re.compile(r'(?P<idNumber>[a-zA-Z0-9-]{10,})')  # candidate: 10+ ASCII letters/digits/dashes
+
+def guess_idnumber(string):
+    """Guess an id/hash-like token in string.
+
+    Returns (groupdict, span) of the first _idnum match when its characters
+    switch between digit / letter / other often enough, else (None, None).
+    """
+    DIGIT, LETTER, OTHER = 0, 1, 2
+
+    def char_class(c):
+        if c in '0123456789':
+            return DIGIT
+        if c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
+            return LETTER
+        return OTHER
+
+    match = _idnum.search(string)
+    if match is None:
+        return None, None
+
+    result = match.groupdict()
+    classes = [char_class(c) for c in result['idNumber']]
+    # the scan starts as if a letter preceded the token, so a leading digit counts as a switch
+    switches = sum(1 for prev, cur in zip([LETTER] + classes, classes) if prev != cur)
+
+    # only return the result as probable if we alternate often between
+    # char type (more likely for hash values than for common words)
+    if float(switches) / len(classes) > 0.4:
+        return result, match.span()
+
+    return None, None
+
+def process(mtree):  # hands guess_idnumber to SingleNodeGuesser with 0.4 (presumably the confidence — confirm) and runs it on mtree
+    SingleNodeGuesser(guess_idnumber, 0.4, log).process(mtree)
--- a/libs/guessit/transfo/guess_release_group.py
+++ b/libs/guessit/transfo/guess_release_group.py
@@ -31,16 +31,22 @@ def get_patterns(property_name):

 CODECS = get_patterns('videoCodec')
 FORMATS = get_patterns('format')
+VAPIS = get_patterns('videoApi')

-GROUP_NAMES = [ r'(?P<videoCodec>' + codec + r')-?(?P<releaseGroup>.*?)[ \.]'
+# RG names following a codec or format, with a potential space or dash inside the name
+GROUP_NAMES = [ r'(?P<videoCodec>' + codec + r')[ \.-](?P<releaseGroup>.+?([- \.].*?)??)[ \.]'
                for codec in CODECS ]
-GROUP_NAMES += [ r'(?P<format>' + fmt + r')-?(?P<releaseGroup>.*?)[ \.]'
+GROUP_NAMES += [ r'(?P<format>'    + fmt   + r')[ \.-](?P<releaseGroup>.+?([- \.].*?)??)[ \.]'
                 for fmt in FORMATS ]
+GROUP_NAMES += [ r'(?P<videoApi>'  + api   + r')[ \.-](?P<releaseGroup>.+?([- \.].*?)??)[ \.]'
+                 for api in VAPIS ]

 GROUP_NAMES2 = [ r'\.(?P<videoCodec>' + codec + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
                 for codec in CODECS ]
-GROUP_NAMES2 += [ r'\.(?P<format>' + fmt + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
+GROUP_NAMES2 += [ r'\.(?P<format>'    + fmt   + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
                  for fmt in FORMATS ]
+GROUP_NAMES2 += [ r'\.(?P<videoApi>'  + vapi  + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
+                  for vapi in VAPIS ]

 GROUP_NAMES = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES ]
 GROUP_NAMES2 = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES2 ]
@@ -54,12 +60,17 @@ def guess_release_group(string):
    # first try to see whether we have both a known codec and a known release group
    for rexp in GROUP_NAMES:
        match = rexp.search(string)
-        if match:
+        while match:
            metadata = match.groupdict()
-            release_group = compute_canonical_form('releaseGroup', metadata['releaseGroup'])
+            # make sure this is an actual release group we caught
+            release_group = (compute_canonical_form('releaseGroup', metadata['releaseGroup']) or
+                             compute_canonical_form('weakReleaseGroup', metadata['releaseGroup']))
            if release_group:
                return adjust_metadata(metadata), (match.start(1), match.end(2))

+            # we didn't find anything conclusive, keep searching
+            match = rexp.search(string, match.span()[0]+1)
+
    # pick anything as releaseGroup as long as we have a codec in front
    # this doesn't include a potential dash ('-') ending the release group
    # eg: [...].X264-HiS@SiLUHD-English.[...]
--- a/libs/guessit/transfo/guess_year.py
+++ b/libs/guessit/transfo/guess_year.py
@@ -33,6 +33,18 @@ def guess_year(string):
    else:
        return None, None

+def guess_year_skip_first(string):
+    """Guess a year from the text after the year located by search_year(); (None, None) if either is missing."""
+    first, first_span = search_year(string)
+    if not first:
+        return None, None
+    guess, span = guess_year(string[first_span[1]:])

-def process(mtree):
-    SingleNodeGuesser(guess_year, 1.0, log).process(mtree)
+    return (guess, (span[0]+first_span[1], span[1]+first_span[1])) if guess else (None, None)
+
+
+def process(mtree, skip_first_year=False):
+    """Apply the year guesser to mtree; with skip_first_year, guess_year_skip_first is used instead of guess_year."""
+    # both variants run through SingleNodeGuesser with the same 1.0 second argument
+    guess_func = guess_year_skip_first if skip_first_year else guess_year
+    SingleNodeGuesser(guess_func, 1.0, log).process(mtree)