Update GuessIt
This commit is contained in:
@@ -20,7 +20,7 @@
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
__version__ = '0.6-dev'
|
||||
__version__ = '0.7-dev'
|
||||
__all__ = ['Guess', 'Language',
|
||||
'guess_file_info', 'guess_video_info',
|
||||
'guess_movie_info', 'guess_episode_info']
|
||||
@@ -91,7 +91,28 @@ log.addHandler(h)
|
||||
|
||||
|
||||
def _guess_filename(filename, filetype):
|
||||
def find_nodes(tree, props):
|
||||
"""Yields all nodes containing any of the given props."""
|
||||
if isinstance(props, base_text_type):
|
||||
props = [props]
|
||||
for node in tree.nodes():
|
||||
if any(prop in node.guess for prop in props):
|
||||
yield node
|
||||
|
||||
def warning(title):
|
||||
log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string()))
|
||||
return m
|
||||
|
||||
mtree = IterativeMatcher(filename, filetype=filetype)
|
||||
|
||||
# if there are multiple possible years found, we assume the first one is
|
||||
# part of the title, reparse the tree taking this into account
|
||||
years = set(n.value for n in find_nodes(mtree.match_tree, 'year'))
|
||||
if len(years) >= 2:
|
||||
mtree = IterativeMatcher(filename, filetype=filetype,
|
||||
opts=['skip_first_year'])
|
||||
|
||||
|
||||
m = mtree.matched()
|
||||
|
||||
if 'language' not in m and 'subtitleLanguage' not in m:
|
||||
@@ -102,20 +123,10 @@ def _guess_filename(filename, filetype):
|
||||
opts=['nolanguage', 'nocountry'])
|
||||
m2 = mtree2.matched()
|
||||
|
||||
def find_nodes(tree, props):
|
||||
"""Yields all nodes containing any of the given props."""
|
||||
if isinstance(props, base_text_type):
|
||||
props = [props]
|
||||
for node in tree.nodes():
|
||||
if any(prop in node.guess for prop in props):
|
||||
yield node
|
||||
|
||||
|
||||
def warning(title):
|
||||
log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string()))
|
||||
if m.get('title') is None:
|
||||
return m
|
||||
|
||||
|
||||
if m.get('title') != m2.get('title'):
|
||||
title = next(find_nodes(mtree.match_tree, 'title'))
|
||||
title2 = next(find_nodes(mtree2.match_tree, 'title'))
|
||||
|
||||
@@ -77,12 +77,12 @@ def file_in_same_dir(ref_file, desired_file):
|
||||
|
||||
def load_file_in_same_dir(ref_file, filename):
    """Load a given file. Works even when the file is contained inside a zip.

    The file is looked up next to ``ref_file``: the last component of
    ``ref_file``'s path is replaced by ``filename``. If one of the components
    of the resulting path ends with ``.zip``, the rest of the path is read as
    a member of that archive.

    Returns the member content as given by ``ZipFile.read`` when the file
    lives inside a zip archive, otherwise the content of the file decoded as
    UTF-8 and passed through ``u()``.
    """
    path = split_path(ref_file)[:-1] + [filename]

    for i, p in enumerate(path):
        if p.endswith('.zip'):
            zfilename = os.path.join(*path[:i + 1])
            zfile = zipfile.ZipFile(zfilename)
            # make sure the archive gets closed once the member has been read
            # (try/finally rather than ``with`` to stay compatible with old
            # interpreters where ZipFile is not a context manager)
            try:
                return zfile.read('/'.join(path[i + 1:]))
            finally:
                zfile.close()

    # close the file handle deterministically instead of relying on the
    # garbage collector
    with io.open(os.path.join(*path), encoding='utf-8') as f:
        return u(f.read())
|
||||
|
||||
@@ -295,7 +295,7 @@ def merge_all(guesses, append=None):
|
||||
# then merge the remaining ones
|
||||
dups = set(result) & set(g)
|
||||
if dups:
|
||||
log.warning('duplicate properties %s in merged result...' % dups)
|
||||
log.warning('duplicate properties %s in merged result...' % [ (result[p], g[p]) for p in dups] )
|
||||
|
||||
result.update_highest_confidence(g)
|
||||
|
||||
|
||||
@@ -326,7 +326,7 @@ def search_language(string, lang_filter=None):
|
||||
'la', 'el', 'del', 'por', 'mar',
|
||||
# other
|
||||
'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii',
|
||||
'vi', 'ben', 'da'
|
||||
'vi', 'ben', 'da', 'lt'
|
||||
])
|
||||
sep = r'[](){} \._-+'
|
||||
|
||||
|
||||
@@ -128,12 +128,14 @@ class IterativeMatcher(object):
|
||||
apply_transfo(name)
|
||||
|
||||
# more guessers for both movies and episodes
|
||||
for name in ['guess_bonus_features', 'guess_year']:
|
||||
apply_transfo(name)
|
||||
apply_transfo('guess_bonus_features')
|
||||
apply_transfo('guess_year', skip_first_year=('skip_first_year' in opts))
|
||||
|
||||
if 'nocountry' not in opts:
|
||||
apply_transfo('guess_country')
|
||||
|
||||
apply_transfo('guess_idnumber')
|
||||
|
||||
|
||||
# split into '-' separated subgroups (with required separator chars
|
||||
# around the dash)
|
||||
|
||||
@@ -275,7 +275,7 @@ class MatchTree(BaseMatchTree):
|
||||
for string_part in ('title', 'series', 'container', 'format',
|
||||
'releaseGroup', 'website', 'audioCodec',
|
||||
'videoCodec', 'screenSize', 'episodeFormat',
|
||||
'audioChannels'):
|
||||
'audioChannels', 'idNumber'):
|
||||
merge_similar_guesses(parts, string_part, choose_string)
|
||||
|
||||
# 2- merge the rest, potentially discarding information not properly
|
||||
|
||||
@@ -43,13 +43,13 @@ episode_rexps = [ # ... Season 2 ...
|
||||
(r'saison (?P<season>[0-9]+)', 1.0, (0, 0)),
|
||||
|
||||
# ... s02e13 ...
|
||||
(r'[Ss](?P<season>[0-9]{1,2}).?(?P<episodeNumber>(?:[Ee-][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)),
|
||||
(r'[Ss](?P<season>[0-9]{1,3})[^0-9]?(?P<episodeNumber>(?:-?[eE-][0-9]{1,3})+)[^0-9]', 1.0, (0, -1)),
|
||||
|
||||
# ... s03-x02 ...
|
||||
(r'[Ss](?P<season>[0-9]{1,2}).?(?P<bonusNumber>(?:[Xx][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)),
|
||||
# ... s03-x02 ... # FIXME: redundant? remove it?
|
||||
#(r'[Ss](?P<season>[0-9]{1,3})[^0-9]?(?P<bonusNumber>(?:-?[xX-][0-9]{1,3})+)[^0-9]', 1.0, (0, -1)),
|
||||
|
||||
# ... 2x13 ...
|
||||
(r'[^0-9](?P<season>[0-9]{1,2}).?(?P<episodeNumber>(?:[xX][0-9]{1,2})+)[^0-9]', 0.8, (1, -1)),
|
||||
(r'[^0-9](?P<season>[0-9]{1,2})[^0-9]?(?P<episodeNumber>(?:-?[xX][0-9]{1,3})+)[^0-9]', 1.0, (1, -1)),
|
||||
|
||||
# ... s02 ...
|
||||
#(sep + r's(?P<season>[0-9]{1,2})' + sep, 0.6, (1, -1)),
|
||||
@@ -122,20 +122,25 @@ prop_multi = { 'format': { 'DVD': [ 'DVD', 'DVD-Rip', 'VIDEO-TS', 'DVDivX' ],
|
||||
'VHS': [ 'VHS' ],
|
||||
'WEB-DL': [ 'WEB-DL' ] },
|
||||
|
||||
'screenSize': { '480p': [ '480p?' ],
|
||||
'720p': [ '720p?' ],
|
||||
'1080p': [ '1080p?' ] },
|
||||
'screenSize': { '480p': [ '480[pi]?' ],
|
||||
'720p': [ '720[pi]?' ],
|
||||
'1080p': [ '1080[pi]?' ] },
|
||||
|
||||
'videoCodec': { 'XviD': [ 'Xvid' ],
|
||||
'DivX': [ 'DVDivX', 'DivX' ],
|
||||
'h264': [ '[hx]-264' ],
|
||||
'Rv10': [ 'Rv10' ] },
|
||||
'Rv10': [ 'Rv10' ],
|
||||
'Mpeg2': [ 'Mpeg2' ] },
|
||||
|
||||
# has nothing to do here (or on filenames for that matter), but some
|
||||
# releases use it and it helps to identify release groups, so we adapt
|
||||
'videoApi': { 'DXVA': [ 'DXVA' ] },
|
||||
|
||||
'audioCodec': { 'AC3': [ 'AC3' ],
|
||||
'DTS': [ 'DTS' ],
|
||||
'AAC': [ 'He-AAC', 'AAC-He', 'AAC' ] },
|
||||
|
||||
'audioChannels': { '5.1': [ r'5\.1', 'DD5\.1', '5ch' ] },
|
||||
'audioChannels': { '5.1': [ r'5\.1', 'DD5[\._ ]1', '5ch' ] },
|
||||
|
||||
'episodeFormat': { 'Minisode': [ 'Minisodes?' ] }
|
||||
|
||||
@@ -143,14 +148,21 @@ prop_multi = { 'format': { 'DVD': [ 'DVD', 'DVD-Rip', 'VIDEO-TS', 'DVDivX' ],
|
||||
|
||||
# prop_single dict of { property_name: [ canonical_form ] }
|
||||
prop_single = { 'releaseGroup': [ 'ESiR', 'WAF', 'SEPTiC', r'\[XCT\]', 'iNT', 'PUKKA',
|
||||
'CHD', 'ViTE', 'TLF', 'DEiTY', 'FLAiTE',
|
||||
'MDX', 'GM4F', 'DVL', 'SVD', 'iLUMiNADOS', 'FiNaLe',
|
||||
'UnSeeN', 'aXXo', 'KLAXXON', 'NoTV', 'ZeaL', 'LOL',
|
||||
'SiNNERS', 'DiRTY', 'REWARD', 'ECI', 'KiNGS', 'CLUE',
|
||||
'CtrlHD', 'POD', 'WiKi', 'DIMENSION', 'IMMERSE', 'FQM',
|
||||
'2HD', 'REPTiLE', 'CTU', 'HALCYON', 'EbP', 'SiTV',
|
||||
'SAiNTS', 'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV',
|
||||
'TLA', 'NTB', 'ASAP', 'MOMENTUM', 'FoV', 'D-Z0N3' ],
|
||||
'CHD', 'ViTE', 'TLF', 'FLAiTE',
|
||||
'MDX', 'GM4F', 'DVL', 'SVD', 'iLUMiNADOS',
|
||||
'aXXo', 'KLAXXON', 'NoTV', 'ZeaL', 'LOL',
|
||||
'CtrlHD', 'POD', 'WiKi','IMMERSE', 'FQM',
|
||||
'2HD', 'CTU', 'HALCYON', 'EbP', 'SiTV',
|
||||
'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV',
|
||||
'TLA', 'NTB', 'ASAP', 'MOMENTUM', 'FoV', 'D-Z0N3',
|
||||
'TrollHD', 'ECI'
|
||||
],
|
||||
|
||||
# potentially confusing release group names (they are words)
|
||||
'weakReleaseGroup': [ 'DEiTY', 'FiNaLe', 'UnSeeN', 'KiNGS', 'CLUE', 'DIMENSION',
|
||||
'SAiNTS', 'ARROW', 'EuReKA', 'SiNNERS', 'DiRTY', 'REWARD',
|
||||
'REPTiLE',
|
||||
],
|
||||
|
||||
'other': [ 'PROPER', 'REPACK', 'LIMITED', 'DualAudio', 'Audiofixed', 'R5',
|
||||
'complete', 'classic', # not so sure about these ones, could appear in a title
|
||||
@@ -179,6 +191,10 @@ properties_rexps.update(dict((type, dict((canonical_form, [ _to_rexp(canonical_f
|
||||
def find_properties(string):
|
||||
result = []
|
||||
for property_name, props in properties_rexps.items():
|
||||
# FIXME: this should be done in a more flexible way...
|
||||
if property_name in ['weakReleaseGroup']:
|
||||
continue
|
||||
|
||||
for canonical_form, rexps in props.items():
|
||||
for value_rexp in rexps:
|
||||
match = value_rexp.search(string)
|
||||
|
||||
@@ -28,7 +28,13 @@ import logging
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
def number_list(s):
    """Return the list of integers found in the string ``s``.

    Every run of digits is converted to an ``int``; anything else acts as a
    separator. When exactly two numbers are found and they are in increasing
    order, they are taken as the bounds of an episode interval and all the
    numbers in between (bounds included) are returned.

    The result is always a real ``list`` (also on Python 3, where ``range``
    is not a list), and it is never empty when ``s`` contains a number, so
    that callers can safely use ``result[0]``.
    """
    numbers = [int(n) for n in re.findall('[0-9]+', s)]

    if len(numbers) == 2 and numbers[0] <= numbers[1]:
        # it is an episode interval, return all numbers in between
        return list(range(numbers[0], numbers[1] + 1))

    # a "reversed" pair cannot be an interval: keep both numbers as they are
    # rather than returning an empty sequence
    return numbers
|
||||
|
||||
def guess_episodes_rexps(string):
|
||||
for rexp, confidence, span_adjust in episode_rexps:
|
||||
@@ -38,23 +44,23 @@ def guess_episodes_rexps(string):
|
||||
span = (match.start() + span_adjust[0],
|
||||
match.end() + span_adjust[1])
|
||||
|
||||
# episodes which have a season > 25 are most likely errors
|
||||
# episodes which have a season > 30 are most likely errors
|
||||
# (Simpsons is at 24!)
|
||||
if int(guess.get('season', 0)) > 25:
|
||||
if int(guess.get('season', 0)) > 30:
|
||||
continue
|
||||
|
||||
# decide whether we have only a single episode number or an
|
||||
# episode list
|
||||
if guess.get('episodeNumber'):
|
||||
eplist = number_list(guess['episodeNumber'])
|
||||
guess.set('episodeNumber', int(eplist[0]), confidence=confidence)
|
||||
guess.set('episodeNumber', eplist[0], confidence=confidence)
|
||||
|
||||
if len(eplist) > 1:
|
||||
guess.set('episodeList', list(map(int, eplist)), confidence=confidence)
|
||||
guess.set('episodeList', eplist, confidence=confidence)
|
||||
|
||||
if guess.get('bonusNumber'):
|
||||
eplist = number_list(guess['bonusNumber'])
|
||||
guess.set('bonusNumber', int(eplist[0]), confidence=confidence)
|
||||
guess.set('bonusNumber', eplist[0], confidence=confidence)
|
||||
|
||||
return guess, span
|
||||
|
||||
|
||||
71
libs/guessit/transfo/guess_idnumber.py
Executable file
71
libs/guessit/transfo/guess_idnumber.py
Executable file
@@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# GuessIt - A library for guessing information from filenames
|
||||
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
|
||||
#
|
||||
# GuessIt is free software; you can redistribute it and/or modify it under
|
||||
# the terms of the Lesser GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# GuessIt is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# Lesser GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the Lesser GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from guessit.transfo import SingleNodeGuesser
|
||||
from guessit.patterns import find_properties
|
||||
import re
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def guess_properties(string):
    """Guess a property from the first match of ``find_properties``.

    Returns a ``({property: value}, (start, end))`` pair built from the first
    entry returned by ``find_properties(string)``, or ``(None, None)`` when
    there is no such entry.
    """
    try:
        first_found = find_properties(string)[0]
    except IndexError:
        # nothing was found in the string
        return None, None

    name, canonical_value, start, stop = first_found
    return {name: canonical_value}, (start, stop)
|
||||
|
||||
# candidate id numbers: runs of at least 10 ASCII letters, digits or dashes
_idnum = re.compile(r'(?P<idNumber>[a-zA-Z0-9-]{10,})')


def guess_idnumber(string):
    """Guess whether ``string`` contains an id number (eg: a hash value).

    Only the first run of 10 or more letters, digits and dashes found in the
    string is considered. It is accepted when more than 40% of its characters
    are of a different type (digit / letter / other) than the character that
    precedes them, which is more likely for hash values than for common
    words.

    Returns ``({'idNumber': candidate}, (start, end))`` when a probable id
    number is found, ``(None, None)`` otherwise.
    """
    DIGIT, LETTER, OTHER = 0, 1, 2

    def char_type(c):
        if c in '0123456789':
            return DIGIT
        if c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
            return LETTER
        return OTHER

    match = _idnum.search(string)
    if match is None:
        return None, None

    result = match.groupdict()
    idnumber = result['idNumber']

    # count how many times we switch from one type of char to another; the
    # (virtual) char preceding the candidate is considered to be a letter
    types = [char_type(c) for c in idnumber]
    switch_count = sum(1 for previous, current in zip([LETTER] + types, types)
                       if previous != current)
    switch_ratio = float(switch_count) / len(idnumber)

    # only return the result as probable if we alternate often between
    # char type (more likely for hash values than for common words)
    if switch_ratio > 0.4:
        return result, match.span()

    return None, None
|
||||
|
||||
def process(mtree):
    """Apply ``guess_idnumber`` to the given match tree.

    The guessing function is wrapped in a ``SingleNodeGuesser`` (with 0.4 as
    its second argument, presumably the confidence of the guess -- to be
    confirmed against ``SingleNodeGuesser``) which then processes ``mtree``.
    """
    guesser = SingleNodeGuesser(guess_idnumber, 0.4, log)
    guesser.process(mtree)
|
||||
@@ -31,16 +31,22 @@ def get_patterns(property_name):
|
||||
|
||||
CODECS = get_patterns('videoCodec')
|
||||
FORMATS = get_patterns('format')
|
||||
VAPIS = get_patterns('videoApi')
|
||||
|
||||
GROUP_NAMES = [ r'(?P<videoCodec>' + codec + r')-?(?P<releaseGroup>.*?)[ \.]'
|
||||
# RG names following a codec or format, with a potential space or dash inside the name
|
||||
GROUP_NAMES = [ r'(?P<videoCodec>' + codec + r')[ \.-](?P<releaseGroup>.+?([- \.].*?)??)[ \.]'
|
||||
for codec in CODECS ]
|
||||
GROUP_NAMES += [ r'(?P<format>' + fmt + r')-?(?P<releaseGroup>.*?)[ \.]'
|
||||
GROUP_NAMES += [ r'(?P<format>' + fmt + r')[ \.-](?P<releaseGroup>.+?([- \.].*?)??)[ \.]'
|
||||
for fmt in FORMATS ]
|
||||
GROUP_NAMES += [ r'(?P<videoApi>' + api + r')[ \.-](?P<releaseGroup>.+?([- \.].*?)??)[ \.]'
|
||||
for api in VAPIS ]
|
||||
|
||||
GROUP_NAMES2 = [ r'\.(?P<videoCodec>' + codec + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
|
||||
for codec in CODECS ]
|
||||
GROUP_NAMES2 += [ r'\.(?P<format>' + fmt + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
|
||||
GROUP_NAMES2 += [ r'\.(?P<format>' + fmt + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
|
||||
for fmt in FORMATS ]
|
||||
GROUP_NAMES2 += [ r'\.(?P<videoApi>' + vapi + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
|
||||
for vapi in VAPIS ]
|
||||
|
||||
GROUP_NAMES = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES ]
|
||||
GROUP_NAMES2 = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES2 ]
|
||||
@@ -54,12 +60,17 @@ def guess_release_group(string):
|
||||
# first try to see whether we have both a known codec and a known release group
|
||||
for rexp in GROUP_NAMES:
|
||||
match = rexp.search(string)
|
||||
if match:
|
||||
while match:
|
||||
metadata = match.groupdict()
|
||||
release_group = compute_canonical_form('releaseGroup', metadata['releaseGroup'])
|
||||
# make sure this is an actual release group we caught
|
||||
release_group = (compute_canonical_form('releaseGroup', metadata['releaseGroup']) or
|
||||
compute_canonical_form('weakReleaseGroup', metadata['releaseGroup']))
|
||||
if release_group:
|
||||
return adjust_metadata(metadata), (match.start(1), match.end(2))
|
||||
|
||||
# we didn't find anything conclusive, keep searching
|
||||
match = rexp.search(string, match.span()[0]+1)
|
||||
|
||||
# pick anything as releaseGroup as long as we have a codec in front
|
||||
# this doesn't include a potential dash ('-') ending the release group
|
||||
# eg: [...].X264-HiS@SiLUHD-English.[...]
|
||||
|
||||
@@ -33,6 +33,18 @@ def guess_year(string):
|
||||
else:
|
||||
return None, None
|
||||
|
||||
def guess_year_skip_first(string):
    """Guess a year in ``string``, skipping the first one that is found.

    The first year is located with ``search_year``; ``guess_year`` is then
    run on what follows it. Returns the result of that second guess together
    with its span translated back to positions in ``string``, or
    ``(None, None)`` if there is no first year or no year after it.
    """
    year, span = search_year(string)
    if year:
        # the second guess is made on the remainder of the string, so its
        # span needs to be shifted by the end position of the first year
        offset = span[1]
        year2, span2 = guess_year(string[offset:])
        if year2:
            return year2, (span2[0] + offset, span2[1] + offset)

    return None, None


def process(mtree, skip_first_year=False):
    """Run the year guesser on the given match tree.

    If ``skip_first_year`` is true, ``guess_year_skip_first`` is used instead
    of ``guess_year``. Either function is wrapped in a ``SingleNodeGuesser``
    with 1.0 as its second argument.
    """
    guesser = guess_year_skip_first if skip_first_year else guess_year
    SingleNodeGuesser(guesser, 1.0, log).process(mtree)
|
||||
|
||||
Reference in New Issue
Block a user