Update guessit with unicode fix
This commit is contained in:
0
libs/guessit/ISO-3166-1_utf8.txt
Normal file → Executable file
0
libs/guessit/ISO-3166-1_utf8.txt
Normal file → Executable file
0
libs/guessit/ISO-639-2_utf-8.txt
Normal file → Executable file
0
libs/guessit/ISO-639-2_utf-8.txt
Normal file → Executable file
92
libs/guessit/__init__.py
Normal file → Executable file
92
libs/guessit/__init__.py
Normal file → Executable file
@@ -18,8 +18,9 @@
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
__version__ = '0.5.2'
|
||||
__version__ = '0.6-dev'
|
||||
__all__ = ['Guess', 'Language',
|
||||
'guess_file_info', 'guess_video_info',
|
||||
'guess_movie_info', 'guess_episode_info']
|
||||
@@ -73,6 +74,7 @@ else:
|
||||
from guessit.guess import Guess, merge_all
|
||||
from guessit.language import Language
|
||||
from guessit.matcher import IterativeMatcher
|
||||
from guessit.textutils import clean_string
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
@@ -88,6 +90,86 @@ h = NullHandler()
|
||||
log.addHandler(h)
|
||||
|
||||
|
||||
def _guess_filename(filename, filetype):
    """Guess the properties of a filename and double-check the title.

    The filename is first matched normally. If a language or subtitle
    language was found, it is matched a second time with the 'nolanguage'
    and 'nocountry' options, and both results are compared to decide whether
    the detected language was in fact part of the title (eg: The Italian Job).

    Returns one of the two merged guesses. When none of the rules below can
    decide, a warning is logged and the first (language-aware) guess is
    returned.
    """
    mtree = IterativeMatcher(filename, filetype=filetype)
    m = mtree.matched()

    if 'language' not in m and 'subtitleLanguage' not in m:
        return m

    # if we found some language, make sure we didn't cut a title or sth...
    mtree2 = IterativeMatcher(filename, filetype=filetype,
                              opts=['nolanguage', 'nocountry'])
    m2 = mtree2.matched()

    def find_nodes(tree, props):
        """Yields all nodes containing any of the given props."""
        if isinstance(props, base_text_type):
            props = [props]
        for node in tree.nodes():
            if any(prop in node.guess for prop in props):
                yield node

    def warning(title):
        """Log both guesses under the given message and fall back to m."""
        log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string()))
        return m

    if m.get('title') != m2.get('title'):
        # use a default with next(): one of the trees may have no title node
        # at all, and a bare next() would then raise StopIteration
        title = next(find_nodes(mtree.match_tree, 'title'), None)
        title2 = next(find_nodes(mtree2.match_tree, 'title'), None)
        if title is None or title2 is None:
            return warning('Could not find a title node in both guesses')

        langs = list(find_nodes(mtree.match_tree, ['language', 'subtitleLanguage']))
        if not langs:
            return warning('A weird error happened with language detection')

        # find the language that is likely more relevant
        for lng in langs:
            if lng.value in title2.value:
                # if the language was detected as part of a potential title,
                # look at this one in particular
                lang = lng
                break
        else:
            # pick the first one if we don't have a better choice
            lang = langs[0]

        # language codes are rarely part of a title, and those
        # should be handled by the Language exceptions anyway
        if len(lang.value) <= 3:
            return m

        # if filetype is subtitle and the language appears last, just before
        # the extension, then it is likely a subtitle language
        parts = clean_string(title.root.value).split()
        # the membership test avoids a ValueError from parts.index() when the
        # language text does not appear as a word of its own after cleaning
        if (m['type'] in ['moviesubtitle', 'episodesubtitle'] and
            lang.value in parts and
            parts.index(lang.value) == len(parts) - 2):
            return m

        # if the language was in the middle of the other potential title,
        # keep the other title (eg: The Italian Job), except if it is at the
        # very beginning, in which case we consider it an error
        # (m2 may have lost its title while merging, hence the .get())
        if (m2.get('title') or '').startswith(lang.value):
            return m
        elif lang.value in title2.value:
            return m2

        # if a node is in an explicit group, then the correct title is probably
        # the other one
        if title.root.node_at(title.node_idx[:2]).is_explicit():
            return m2
        elif title2.root.node_at(title2.node_idx[:2]).is_explicit():
            return m

        return warning('Not sure of the title because of the language position')

    return m
|
||||
|
||||
|
||||
def guess_file_info(filename, filetype, info=None):
|
||||
"""info can contain the names of the various plugins, such as 'filename' to
|
||||
detect filename info, or 'hash_md5' to get the md5 hash of the file.
|
||||
@@ -98,6 +180,9 @@ def guess_file_info(filename, filetype, info=None):
|
||||
result = []
|
||||
hashers = []
|
||||
|
||||
# Force unicode as soon as possible
|
||||
filename = u(filename)
|
||||
|
||||
if info is None:
|
||||
info = ['filename']
|
||||
|
||||
@@ -106,8 +191,7 @@ def guess_file_info(filename, filetype, info=None):
|
||||
|
||||
for infotype in info:
|
||||
if infotype == 'filename':
|
||||
m = IterativeMatcher(filename, filetype=filetype)
|
||||
result.append(m.matched())
|
||||
result.append(_guess_filename(filename, filetype))
|
||||
|
||||
elif infotype == 'hash_mpc':
|
||||
from guessit.hash_mpc import hash_file
|
||||
@@ -161,7 +245,7 @@ def guess_file_info(filename, filetype, info=None):
|
||||
# last minute adjustments
|
||||
|
||||
# if country is in the guessed properties, make it part of the filename
|
||||
if 'country' in result:
|
||||
if 'series' in result and 'country' in result:
|
||||
result['series'] += ' (%s)' % result['country'].alpha2.upper()
|
||||
|
||||
|
||||
|
||||
115
libs/guessit/__main__.py
Executable file
115
libs/guessit/__main__.py
Executable file
@@ -0,0 +1,115 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# GuessIt - A library for guessing information from filenames
|
||||
# Copyright (c) 2011 Nicolas Wack <wackou@gmail.com>
|
||||
#
|
||||
# GuessIt is free software; you can redistribute it and/or modify it under
|
||||
# the terms of the Lesser GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# GuessIt is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# Lesser GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the Lesser GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import print_function
|
||||
from guessit import u
|
||||
from guessit import slogging, guess_file_info
|
||||
from optparse import OptionParser
|
||||
import logging
|
||||
|
||||
|
||||
def detect_filename(filename, filetype, info=None):
    """Print what GuessIt detects for a single filename.

    filename -- the name or path to analyze; it is converted with u() first
    filetype -- the file type, passed on unchanged to guess_file_info
    info     -- list of the information types to compute; defaults to
                ['filename'] when omitted
    """
    # build the default per call rather than sharing one mutable list object
    # between all the calls of this function
    if info is None:
        info = ['filename']

    filename = u(filename)

    print('For:', filename)
    print('GuessIt found:', guess_file_info(filename, filetype, info).nice_string())
|
||||
|
||||
|
||||
def run_demo(episodes=True, movies=True):
    """Run the detection on a few builtin sample paths and print the results.

    episodes -- when true, analyze the sample episode paths as 'episode'
    movies   -- when true, analyze the sample movie paths as 'movie'
    """
    # NOTE: tests should not be added here but rather in the tests/ folder
    #       this is just intended as a quick example

    def show_all(samples, kind):
        # one separator line, then the detection output, for each sample
        for sample in samples:
            print('-'*80)
            detect_filename(sample, filetype=kind)

    if episodes:
        show_all((
            'Series/Californication/Season 2/Californication.2x05.Vaginatown.HDTV.XviD-0TV.[tvu.org.ru].avi',
            'Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi',
            'Series/Treme/Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.[tvu.org.ru].avi',
            'Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi',
            'Series/Duckman/Duckman - S1E13 Joking The Chicken (unedited).avi',
            'Series/Simpsons/The_simpsons_s13e18_-_i_am_furious_yellow.mpg',
            'Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.[tvu.org.ru].avi',
            'Series/Dr._Slump_-_002_DVB-Rip_Catalan_by_kelf.avi',
            'Series/Kaamelott/Kaamelott - Livre V - Second Volet - HD 704x396 Xvid 2 pass - Son 5.1 - TntRip by Slurm.avi',
        ), 'episode')

    if movies:
        show_all((
            'Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv',
            'Movies/El Dia de la Bestia (1995)/El.dia.de.la.bestia.DVDrip.Spanish.DivX.by.Artik[SEDG].avi',
            'Movies/Blade Runner (1982)/Blade.Runner.(1982).(Director\'s.Cut).CD1.DVDRip.XviD.AC3-WAF.avi',
            'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv',
            'Movies/Sin City (BluRay) (2005)/Sin.City.2005.BDRip.720p.x264.AC3-SEPTiC.mkv',
            'Movies/Borat (2006)/Borat.(2006).R5.PROPER.REPACK.DVDRip.XviD-PUKKA.avi', # FIXME: PROPER and R5 get overwritten
            '[XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv', # FIXME: title gets overwritten
            'Battle Royale (2000)/Battle.Royale.(Batoru.Rowaiaru).(2000).(Special.Edition).CD1of2.DVDRiP.XviD-[ZeaL].avi',
            'Movies/Brazil (1985)/Brazil_Criterion_Edition_(1985).CD2.English.srt',
            'Movies/Persepolis (2007)/[XCT] Persepolis [H264+Aac-128(Fr-Eng)+ST(Fr-Eng)+Ind].mkv',
            'Movies/Toy Story (1995)/Toy Story [HDTV 720p English-Spanish].mkv',
            'Movies/Pirates of the Caribbean: The Curse of the Black Pearl (2003)/Pirates.Of.The.Carribean.DC.2003.iNT.DVDRip.XviD.AC3-NDRT.CD1.avi',
            'Movies/Office Space (1999)/Office.Space.[Dual-DVDRip].[Spanish-English].[XviD-AC3-AC3].[by.Oswald].avi',
            'Movies/The NeverEnding Story (1984)/The.NeverEnding.Story.1.1984.DVDRip.AC3.Xvid-Monteque.avi',
            'Movies/Juno (2007)/Juno KLAXXON.avi',
            'Movies/Chat noir, chat blanc (1998)/Chat noir, Chat blanc - Emir Kusturica (VO - VF - sub FR - Chapters).mkv',
            'Movies/Wild Zero (2000)/Wild.Zero.DVDivX-EPiC.srt',
            'Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720x432].avi',
            'testsmewt_bugs/movies/Baraka_Edition_Collector.avi',
        ), 'movie')
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse the options, then run the demo or
    analyze each of the files given as arguments.

    Without --demo and without any file argument, the help text is printed.
    """
    slogging.setupLogging()

    cli = OptionParser(usage='usage: %prog [options] file1 [file2...]')
    cli.add_option('-v', '--verbose', dest='verbose', default=False,
                   action='store_true',
                   help='display debug output')
    cli.add_option('-i', '--info', dest='info', default='filename',
                   help='the desired information type: filename, hash_mpc or a hash from python\'s '
                        'hashlib module, such as hash_md5, hash_sha1, ...; or a list of any of '
                        'them, comma-separated')
    cli.add_option('-t', '--type', dest='filetype', default='autodetect',
                   help='the suggested file type: movie, episode or autodetect')
    cli.add_option('-d', '--demo', dest='demo', default=False,
                   action='store_true',
                   help='run a few builtin tests instead of analyzing a file')

    options, args = cli.parse_args()

    if options.verbose:
        logging.getLogger('guessit').setLevel(logging.DEBUG)

    # the demo takes precedence over any file given on the command line
    if options.demo:
        run_demo(episodes=True, movies=True)
        return

    if not args:
        cli.print_help()
        return

    for filename in args:
        # the info list is split again for every file, so each call gets its
        # own list object
        detect_filename(filename,
                        filetype=options.filetype,
                        info=options.info.split(','))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
0
libs/guessit/country.py
Normal file → Executable file
0
libs/guessit/country.py
Normal file → Executable file
0
libs/guessit/date.py
Normal file → Executable file
0
libs/guessit/date.py
Normal file → Executable file
7
libs/guessit/fileutils.py
Normal file → Executable file
7
libs/guessit/fileutils.py
Normal file → Executable file
@@ -22,6 +22,7 @@ from __future__ import unicode_literals
|
||||
from guessit import s, u
|
||||
import os.path
|
||||
import zipfile
|
||||
import io
|
||||
|
||||
|
||||
def split_path(path):
|
||||
@@ -76,12 +77,12 @@ def file_in_same_dir(ref_file, desired_file):
|
||||
|
||||
def load_file_in_same_dir(ref_file, filename):
    """Load a given file. Works even when the file is contained inside a zip.

    ref_file -- path of a file located in the directory to read from
    filename -- name of the file to load from that same directory

    When a component of the resulting path ends with '.zip', the remainder
    of the path is read as a member of that archive and returned as given by
    ZipFile.read(). Otherwise the file is read as UTF-8 text and passed
    through u().
    """
    path = split_path(ref_file)[:-1] + [str(filename)]

    for i, p in enumerate(path):
        if p[-4:] == '.zip':
            zfilename = os.path.join(*path[:i + 1])
            zfile = zipfile.ZipFile(zfilename)
            # close the archive explicitly instead of leaking its handle
            try:
                return zfile.read('/'.join(path[i + 1:]))
            finally:
                zfile.close()

    # the with block closes the file even if reading or decoding fails
    with io.open(os.path.join(*path), encoding='utf-8') as f:
        return u(f.read())
|
||||
|
||||
52
libs/guessit/guess.py
Normal file → Executable file
52
libs/guessit/guess.py
Normal file → Executable file
@@ -253,48 +253,26 @@ def merge_similar_guesses(guesses, prop, choose):
|
||||
merge_similar_guesses(guesses, prop, choose)
|
||||
|
||||
|
||||
def merge_append_guesses(guesses, prop):
|
||||
"""Take a list of guesses and merge those which have the same properties by
|
||||
appending them in a list.
|
||||
|
||||
DEPRECATED, remove with old guessers
|
||||
|
||||
"""
|
||||
similar = [guess for guess in guesses if prop in guess]
|
||||
if not similar:
|
||||
return
|
||||
|
||||
merged = similar[0]
|
||||
merged[prop] = [merged[prop]]
|
||||
# TODO: what to do with global confidence? mean of them all?
|
||||
|
||||
for m in similar[1:]:
|
||||
for prop2 in m:
|
||||
if prop == prop2:
|
||||
merged[prop].append(m[prop])
|
||||
else:
|
||||
if prop2 in m:
|
||||
log.warning('overwriting property "%s" with value %s' % (prop2, m[prop2]))
|
||||
merged[prop2] = m[prop2]
|
||||
# TODO: confidence also
|
||||
|
||||
guesses.remove(m)
|
||||
|
||||
|
||||
def merge_all(guesses, append=None):
|
||||
"""Merge all the guesses in a single result, remove very unlikely values,
|
||||
and return it.
|
||||
You can specify a list of properties that should be appended into a list
|
||||
instead of being merged.
|
||||
|
||||
>>> s(merge_all([ Guess({ 'season': 2 }, confidence = 0.6),
|
||||
... Guess({ 'episodeNumber': 13 }, confidence = 0.8) ]))
|
||||
>>> s(merge_all([ Guess({'season': 2}, confidence=0.6),
|
||||
... Guess({'episodeNumber': 13}, confidence=0.8) ]))
|
||||
{'season': 2, 'episodeNumber': 13}
|
||||
|
||||
>>> s(merge_all([ Guess({ 'episodeNumber': 27 }, confidence = 0.02),
|
||||
... Guess({ 'season': 1 }, confidence = 0.2) ]))
|
||||
>>> s(merge_all([ Guess({'episodeNumber': 27}, confidence=0.02),
|
||||
... Guess({'season': 1}, confidence=0.2) ]))
|
||||
{'season': 1}
|
||||
|
||||
>>> s(merge_all([ Guess({'other': 'PROPER'}, confidence=0.8),
|
||||
... Guess({'releaseGroup': '2HD'}, confidence=0.8) ],
|
||||
... append=['other']))
|
||||
{'releaseGroup': '2HD', 'other': ['PROPER']}
|
||||
|
||||
|
||||
"""
|
||||
if not guesses:
|
||||
return Guess()
|
||||
@@ -328,7 +306,13 @@ def merge_all(guesses, append=None):
|
||||
|
||||
# make sure our appendable properties contain unique values
|
||||
for prop in append:
|
||||
if prop in result:
|
||||
result[prop] = list(set(result[prop]))
|
||||
try:
|
||||
value = result[prop]
|
||||
if isinstance(value, list):
|
||||
result[prop] = list(set(value))
|
||||
else:
|
||||
result[prop] = [ value ]
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
return result
|
||||
|
||||
0
libs/guessit/hash_ed2k.py
Normal file → Executable file
0
libs/guessit/hash_ed2k.py
Normal file → Executable file
0
libs/guessit/hash_mpc.py
Normal file → Executable file
0
libs/guessit/hash_mpc.py
Normal file → Executable file
30
libs/guessit/language.py
Normal file → Executable file
30
libs/guessit/language.py
Normal file → Executable file
@@ -21,13 +21,14 @@
|
||||
from __future__ import unicode_literals
|
||||
from guessit import UnicodeMixin, base_text_type, u, s
|
||||
from guessit.fileutils import load_file_in_same_dir
|
||||
from guessit.textutils import find_words
|
||||
from guessit.country import Country
|
||||
import re
|
||||
import logging
|
||||
|
||||
__all__ = [ 'is_iso_language', 'is_language', 'lang_set', 'Language',
|
||||
'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'UNDETERMINED',
|
||||
'search_language' ]
|
||||
'search_language', 'guess_language' ]
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
@@ -317,7 +318,7 @@ def search_language(string, lang_filter=None):
|
||||
'is', 'it', 'am', 'mad', 'men', 'man', 'run', 'sin', 'st', 'to',
|
||||
'no', 'non', 'war', 'min', 'new', 'car', 'day', 'bad', 'bat', 'fan',
|
||||
'fry', 'cop', 'zen', 'gay', 'fat', 'cherokee', 'got', 'an', 'as',
|
||||
'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr',
|
||||
'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr', 'rum', 'pi',
|
||||
# french words
|
||||
'bas', 'de', 'le', 'son', 'vo', 'vf', 'ne', 'ca', 'ce', 'et', 'que',
|
||||
'mal', 'est', 'vol', 'or', 'mon', 'se',
|
||||
@@ -325,7 +326,7 @@ def search_language(string, lang_filter=None):
|
||||
'la', 'el', 'del', 'por', 'mar',
|
||||
# other
|
||||
'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii',
|
||||
'vi'
|
||||
'vi', 'ben', 'da'
|
||||
])
|
||||
sep = r'[](){} \._-+'
|
||||
|
||||
@@ -334,7 +335,8 @@ def search_language(string, lang_filter=None):
|
||||
|
||||
slow = ' %s ' % string.lower()
|
||||
confidence = 1.0 # for all of them
|
||||
for lang in lng_all_names:
|
||||
|
||||
for lang in set(find_words(slow)) & lng_all_names:
|
||||
|
||||
if lang in lng_common_words:
|
||||
continue
|
||||
@@ -351,7 +353,7 @@ def search_language(string, lang_filter=None):
|
||||
if lang_filter and language not in lang_filter:
|
||||
continue
|
||||
|
||||
# only allow those languages that have a 2-letter code, those who
|
||||
# only allow those languages that have a 2-letter code, those that
|
||||
# don't are too esoteric and probably false matches
|
||||
if language.lang not in lng3_to_lng2:
|
||||
continue
|
||||
@@ -364,9 +366,25 @@ def search_language(string, lang_filter=None):
|
||||
else:
|
||||
# Note: we could either be really confident that we found a
|
||||
# language or assume that full language names are too
|
||||
# common words
|
||||
# common words and lower their confidence accordingly
|
||||
confidence = 0.3 # going with the low-confidence route here
|
||||
|
||||
return language, (pos - 1, end - 1), confidence
|
||||
|
||||
return None, None, None
|
||||
|
||||
|
||||
def guess_language(text):
    """Return the Language in which the given body of text is written.

    The detection is delegated to the external guess-language python module.
    If an ImportError is raised (typically because that module is not
    installed), two error messages are logged and UNDETERMINED is returned.
    """
    try:
        from guess_language import guessLanguage
        detected = guessLanguage(text)
        return Language(detected)

    except ImportError:
        for message in (
                'Cannot detect the language of the given text body, missing dependency: guess-language',
                'Please install it from PyPI, by doing eg: pip install guess-language'):
            log.error(message)
        return UNDETERMINED
|
||||
|
||||
52
libs/guessit/matcher.py
Normal file → Executable file
52
libs/guessit/matcher.py
Normal file → Executable file
@@ -19,18 +19,16 @@
|
||||
#
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from guessit import PY3, u
|
||||
from guessit import PY3, u, base_text_type
|
||||
from guessit.matchtree import MatchTree
|
||||
from guessit.guess import (merge_similar_guesses, merge_all,
|
||||
choose_int, choose_string)
|
||||
import copy
|
||||
from guessit.textutils import normalize_unicode
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class IterativeMatcher(object):
|
||||
def __init__(self, filename, filetype='autodetect'):
|
||||
def __init__(self, filename, filetype='autodetect', opts=None):
|
||||
"""An iterative matcher tries to match different patterns that appear
|
||||
in the filename.
|
||||
|
||||
@@ -76,6 +74,14 @@ class IterativeMatcher(object):
|
||||
raise ValueError("filetype needs to be one of %s" % valid_filetypes)
|
||||
if not PY3 and not isinstance(filename, unicode):
|
||||
log.warning('Given filename to matcher is not unicode...')
|
||||
filename = filename.decode('utf-8')
|
||||
|
||||
filename = normalize_unicode(filename)
|
||||
|
||||
if opts is None:
|
||||
opts = []
|
||||
elif isinstance(opts, base_text_type):
|
||||
opts = opts.split()
|
||||
|
||||
self.match_tree = MatchTree(filename)
|
||||
mtree = self.match_tree
|
||||
@@ -84,7 +90,7 @@ class IterativeMatcher(object):
|
||||
def apply_transfo(transfo_name, *args, **kwargs):
|
||||
transfo = __import__('guessit.transfo.' + transfo_name,
|
||||
globals=globals(), locals=locals(),
|
||||
fromlist=['process'], level=-1)
|
||||
fromlist=['process'], level=0)
|
||||
transfo.process(mtree, *args, **kwargs)
|
||||
|
||||
# 1- first split our path into dirs + basename + ext
|
||||
@@ -115,13 +121,20 @@ class IterativeMatcher(object):
|
||||
'guess_properties', 'guess_language',
|
||||
'guess_video_rexps' ]
|
||||
|
||||
if 'nolanguage' in opts:
|
||||
strategy.remove('guess_language')
|
||||
|
||||
for name in strategy:
|
||||
apply_transfo(name)
|
||||
|
||||
# more guessers for both movies and episodes
|
||||
for name in ['guess_bonus_features', 'guess_year', 'guess_country']:
|
||||
for name in ['guess_bonus_features', 'guess_year']:
|
||||
apply_transfo(name)
|
||||
|
||||
if 'nocountry' not in opts:
|
||||
apply_transfo('guess_country')
|
||||
|
||||
|
||||
# split into '-' separated subgroups (with required separator chars
|
||||
# around the dash)
|
||||
apply_transfo('split_on_dash')
|
||||
@@ -139,27 +152,4 @@ class IterativeMatcher(object):
|
||||
log.debug('Found match tree:\n%s' % u(mtree))
|
||||
|
||||
def matched(self):
|
||||
# we need to make a copy here, as the merge functions work in place and
|
||||
# calling them on the match tree would modify it
|
||||
|
||||
parts = [node.guess for node in self.match_tree.nodes() if node.guess]
|
||||
parts = copy.deepcopy(parts)
|
||||
|
||||
# 1- try to merge similar information together and give it a higher
|
||||
# confidence
|
||||
for int_part in ('year', 'season', 'episodeNumber'):
|
||||
merge_similar_guesses(parts, int_part, choose_int)
|
||||
|
||||
for string_part in ('title', 'series', 'container', 'format',
|
||||
'releaseGroup', 'website', 'audioCodec',
|
||||
'videoCodec', 'screenSize', 'episodeFormat',
|
||||
'audioChannels'):
|
||||
merge_similar_guesses(parts, string_part, choose_string)
|
||||
|
||||
# 2- merge the rest, potentially discarding information not properly
|
||||
# merged before
|
||||
result = merge_all(parts,
|
||||
append=['language', 'subtitleLanguage', 'other'])
|
||||
|
||||
log.debug('Final result: ' + result.nice_string())
|
||||
return result
|
||||
return self.match_tree.matched()
|
||||
|
||||
28
libs/guessit/matchtree.py
Normal file → Executable file
28
libs/guessit/matchtree.py
Normal file → Executable file
@@ -22,6 +22,9 @@ from __future__ import unicode_literals
|
||||
from guessit import UnicodeMixin, base_text_type, Guess
|
||||
from guessit.textutils import clean_string, str_fill
|
||||
from guessit.patterns import group_delimiters
|
||||
from guessit.guess import (merge_similar_guesses, merge_all,
|
||||
choose_int, choose_string)
|
||||
import copy
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
@@ -257,3 +260,28 @@ class MatchTree(BaseMatchTree):
|
||||
"""Return whether the group was explicitly enclosed by
|
||||
parentheses/square brackets/etc."""
|
||||
return (self.value[0] + self.value[-1]) in group_delimiters
|
||||
|
||||
def matched(self):
    """Merge the guesses of all the nodes of this tree into a single result
    and return it."""
    # work on a deep copy: the merge functions operate in place, and calling
    # them on the guesses of the match tree itself would modify it
    guesses = copy.deepcopy([node.guess for node in self.nodes() if node.guess])

    # 1- try to merge similar information together and give it a higher
    #    confidence; each group of properties has its own choose function
    merge_plan = (
        (choose_int, ('year', 'season', 'episodeNumber')),
        (choose_string, ('title', 'series', 'container', 'format',
                         'releaseGroup', 'website', 'audioCodec',
                         'videoCodec', 'screenSize', 'episodeFormat',
                         'audioChannels')),
    )
    for choose, props in merge_plan:
        for prop in props:
            merge_similar_guesses(guesses, prop, choose)

    # 2- merge the rest, potentially discarding information not properly
    #    merged before
    merged = merge_all(guesses,
                       append=['language', 'subtitleLanguage', 'other'])

    log.debug('Final result: ' + merged.nice_string())
    return merged
|
||||
|
||||
@@ -20,9 +20,10 @@
|
||||
#
|
||||
|
||||
from __future__ import unicode_literals
|
||||
import re
|
||||
|
||||
|
||||
subtitle_exts = [ 'srt', 'idx', 'sub', 'ssa', 'txt' ]
|
||||
subtitle_exts = [ 'srt', 'idx', 'sub', 'ssa' ]
|
||||
|
||||
video_exts = ['3g2', '3gp', '3gp2', 'asf', 'avi', 'divx', 'flv', 'm4v', 'mk2',
|
||||
'mka', 'mkv', 'mov', 'mp4', 'mp4a', 'mpeg', 'mpg', 'ogg', 'ogm',
|
||||
@@ -42,13 +43,13 @@ episode_rexps = [ # ... Season 2 ...
|
||||
(r'saison (?P<season>[0-9]+)', 1.0, (0, 0)),
|
||||
|
||||
# ... s02e13 ...
|
||||
(r'[Ss](?P<season>[0-9]{1,2}).{,3}(?P<episodeNumber>(?:[Ee][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)),
|
||||
(r'[Ss](?P<season>[0-9]{1,2}).?(?P<episodeNumber>(?:[Ee-][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)),
|
||||
|
||||
# ... s03-x02 ...
|
||||
(r'[Ss](?P<season>[0-9]{1,2}).{,3}(?P<bonusNumber>(?:[Xx][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)),
|
||||
(r'[Ss](?P<season>[0-9]{1,2}).?(?P<bonusNumber>(?:[Xx][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)),
|
||||
|
||||
# ... 2x13 ...
|
||||
(r'[^0-9](?P<season>[0-9]{1,2})(?P<episodeNumber>(?:[xX][0-9]{1,2})+)[^0-9]', 0.8, (1, -1)),
|
||||
(r'[^0-9](?P<season>[0-9]{1,2}).?(?P<episodeNumber>(?:[xX][0-9]{1,2})+)[^0-9]', 0.8, (1, -1)),
|
||||
|
||||
# ... s02 ...
|
||||
#(sep + r's(?P<season>[0-9]{1,2})' + sep, 0.6, (1, -1)),
|
||||
@@ -61,7 +62,7 @@ episode_rexps = [ # ... Season 2 ...
|
||||
('ep' + sep + r'(?P<episodeNumber>[0-9]{1,2})[^0-9]', 0.7, (0, -1)),
|
||||
|
||||
# ... e13 ... for a mini-series without a season number
|
||||
(r'e(?P<episodeNumber>[0-9]{1,2})[^0-9]', 0.6, (0, -1))
|
||||
(sep + r'e(?P<episodeNumber>[0-9]{1,2})' + sep, 0.6, (1, -1))
|
||||
|
||||
]
|
||||
|
||||
@@ -99,92 +100,129 @@ video_rexps = [ # cd number
|
||||
(r'f(?P<filmNumber>[0-9]{1,2})', 1.0, (0, 0))
|
||||
]
|
||||
|
||||
websites = [ 'tvu.org.ru', 'emule-island.com', 'UsaBit.com', 'www.divx-overnet.com', 'sharethefiles.com' ]
|
||||
websites = [ 'tvu.org.ru', 'emule-island.com', 'UsaBit.com', 'www.divx-overnet.com',
|
||||
'sharethefiles.com' ]
|
||||
|
||||
unlikely_series = ['series']
|
||||
unlikely_series = [ 'series' ]
|
||||
|
||||
properties = { 'format': [ 'DVDRip', 'HD-DVD', 'HDDVD', 'HDDVDRip', 'BluRay', 'Blu-ray', 'BDRip', 'BRRip',
|
||||
'HDRip', 'DVD', 'DVDivX', 'HDTV', 'DVB', 'DVBRip', 'PDTV', 'WEBRip',
|
||||
'DVDSCR', 'Screener', 'VHS', 'VIDEO_TS', 'WEB-DL', 'WEBDL' ],
|
||||
|
||||
'screenSize': [ '720p', '720', '1080p', '1080' ],
|
||||
# prop_multi is a dict of { property_name: { canonical_form: [ pattern ] } }
|
||||
# pattern is a string considered as a regexp, with the addition that dashes are
|
||||
# replaced with '([ \.-_])?' which matches more types of separators (or none)
|
||||
# note: simpler patterns need to be at the end of the list to not shadow more
|
||||
# complete ones, eg: 'AAC' needs to come after 'He-AAC'
|
||||
# ie: from most specific to less specific
|
||||
prop_multi = { 'format': { 'DVD': [ 'DVD', 'DVD-Rip', 'VIDEO-TS', 'DVDivX' ],
|
||||
'HD-DVD': [ 'HD-(?:DVD)?-Rip', 'HD-DVD' ],
|
||||
'BluRay': [ 'Blu-ray', 'B[DR]Rip' ],
|
||||
'HDTV': [ 'HD-TV' ],
|
||||
'DVB': [ 'DVB-Rip', 'DVB', 'PD-TV' ],
|
||||
'WEBRip': [ 'WEB-Rip' ],
|
||||
'Screener': [ 'DVD-SCR', 'Screener' ],
|
||||
'VHS': [ 'VHS' ],
|
||||
'WEB-DL': [ 'WEB-DL' ] },
|
||||
|
||||
'videoCodec': [ 'XviD', 'DivX', 'x264', 'h264', 'Rv10' ],
|
||||
'screenSize': { '480p': [ '480p?' ],
|
||||
'720p': [ '720p?' ],
|
||||
'1080p': [ '1080p?' ] },
|
||||
|
||||
'audioCodec': [ 'AC3', 'DTS', 'He-AAC', 'AAC-He', 'AAC' ],
|
||||
'videoCodec': { 'XviD': [ 'Xvid' ],
|
||||
'DivX': [ 'DVDivX', 'DivX' ],
|
||||
'h264': [ '[hx]-264' ],
|
||||
'Rv10': [ 'Rv10' ] },
|
||||
|
||||
'audioChannels': [ '5.1' ],
|
||||
'audioCodec': { 'AC3': [ 'AC3' ],
|
||||
'DTS': [ 'DTS' ],
|
||||
'AAC': [ 'He-AAC', 'AAC-He', 'AAC' ] },
|
||||
|
||||
'releaseGroup': [ 'ESiR', 'WAF', 'SEPTiC', '[XCT]', 'iNT', 'PUKKA',
|
||||
'CHD', 'ViTE', 'TLF', 'DEiTY', 'FLAiTE',
|
||||
'MDX', 'GM4F', 'DVL', 'SVD', 'iLUMiNADOS', ' FiNaLe',
|
||||
'UnSeeN', 'aXXo', 'KLAXXON', 'NoTV', 'ZeaL', 'LOL',
|
||||
'SiNNERS', 'DiRTY', 'REWARD', 'ECI', 'KiNGS', 'CLUE',
|
||||
'CtrlHD', 'POD', 'WiKi', 'DIMENSION', 'IMMERSE', 'FQM',
|
||||
'2HD', 'REPTiLE', 'CTU', 'HALCYON', 'EbP', 'SiTV', 'SAiNTS',
|
||||
'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV' ],
|
||||
'audioChannels': { '5.1': [ r'5\.1', 'DD5\.1', '5ch' ] },
|
||||
|
||||
'episodeFormat': [ 'Minisode', 'Minisodes' ],
|
||||
'episodeFormat': { 'Minisode': [ 'Minisodes?' ] }
|
||||
|
||||
'other': [ '5ch', 'PROPER', 'REPACK', 'LIMITED', 'DualAudio', 'iNTERNAL', 'Audiofixed', 'R5',
|
||||
'complete', 'classic', # not so sure about these ones, could appear in a title
|
||||
'ws', # widescreen
|
||||
],
|
||||
}
|
||||
|
||||
# prop_single dict of { property_name: [ canonical_form ] }
|
||||
prop_single = { 'releaseGroup': [ 'ESiR', 'WAF', 'SEPTiC', r'\[XCT\]', 'iNT', 'PUKKA',
|
||||
'CHD', 'ViTE', 'TLF', 'DEiTY', 'FLAiTE',
|
||||
'MDX', 'GM4F', 'DVL', 'SVD', 'iLUMiNADOS', 'FiNaLe',
|
||||
'UnSeeN', 'aXXo', 'KLAXXON', 'NoTV', 'ZeaL', 'LOL',
|
||||
'SiNNERS', 'DiRTY', 'REWARD', 'ECI', 'KiNGS', 'CLUE',
|
||||
'CtrlHD', 'POD', 'WiKi', 'DIMENSION', 'IMMERSE', 'FQM',
|
||||
'2HD', 'REPTiLE', 'CTU', 'HALCYON', 'EbP', 'SiTV',
|
||||
'SAiNTS', 'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV',
|
||||
'TLA', 'NTB', 'ASAP', 'MOMENTUM', 'FoV', 'D-Z0N3' ],
|
||||
|
||||
def find_properties(filename):
|
||||
'other': [ 'PROPER', 'REPACK', 'LIMITED', 'DualAudio', 'Audiofixed', 'R5',
|
||||
'complete', 'classic', # not so sure about these ones, could appear in a title
|
||||
'ws' ] # widescreen
|
||||
}
|
||||
|
||||
# character used in the property patterns to mark an optional separator
_dash = '-'
# regexp matching at most one separator char; written as a raw string so
# that '\.' is not an invalid escape sequence in a regular string literal
_psep = r'[-\. _]?'


def _to_rexp(prop):
    """Compile a property pattern into a case-insensitive regexp.

    prop is used as a regular expression in which every dash is first
    replaced with _psep, so that 'DVD-Rip' also matches 'DVDRip', 'dvd.rip',
    'DVD_Rip' or 'DVD Rip'.
    """
    return re.compile(prop.replace(_dash, _psep), re.IGNORECASE)
|
||||
|
||||
# properties_rexps dict of { property_name: { canonical_form: [ rexp ] } }
|
||||
# containing the rexps compiled from both prop_multi and prop_single
|
||||
properties_rexps = dict((type, dict((canonical_form,
|
||||
[ _to_rexp(pattern) for pattern in patterns ])
|
||||
for canonical_form, patterns in props.items()))
|
||||
for type, props in prop_multi.items())
|
||||
|
||||
properties_rexps.update(dict((type, dict((canonical_form, [ _to_rexp(canonical_form) ])
|
||||
for canonical_form in props))
|
||||
for type, props in prop_single.items()))
|
||||
|
||||
|
||||
|
||||
def find_properties(string):
|
||||
result = []
|
||||
clow = filename.lower()
|
||||
for prop, values in properties.items():
|
||||
for value in values:
|
||||
pos = clow.find(value.lower())
|
||||
if pos != -1:
|
||||
end = pos + len(value)
|
||||
# make sure our word is always surrounded by separators
|
||||
if ((pos > 0 and clow[pos - 1] not in sep) or
|
||||
(end < len(clow) and clow[end] not in sep)):
|
||||
for property_name, props in properties_rexps.items():
|
||||
for canonical_form, rexps in props.items():
|
||||
for value_rexp in rexps:
|
||||
match = value_rexp.search(string)
|
||||
if match:
|
||||
start, end = match.span()
|
||||
# make sure our word is always surrounded by separators
|
||||
# note: sep is a regexp, but in this case using it as
|
||||
# a sequence achieves the same goal
|
||||
continue
|
||||
# a char sequence achieves the same goal
|
||||
if ((start > 0 and string[start-1] not in sep) or
|
||||
(end < len(string) and string[end] not in sep)):
|
||||
continue
|
||||
|
||||
result.append((prop, value, pos, end))
|
||||
result.append((property_name, canonical_form, start, end))
|
||||
return result
|
||||
|
||||
|
||||
property_synonyms = { 'DVD': [ 'DVDRip', 'VIDEO_TS' ],
|
||||
'HD-DVD': [ 'HDDVD', 'HDDVDRip' ],
|
||||
'BluRay': [ 'BDRip', 'BRRip', 'Blu-ray' ],
|
||||
'WEB-DL': [ 'WEBDL' ],
|
||||
'DVB': [ 'DVBRip', 'PDTV' ],
|
||||
'Screener': [ 'DVDSCR' ],
|
||||
'DivX': [ 'DVDivX' ],
|
||||
'h264': [ 'x264' ],
|
||||
'720p': [ '720' ],
|
||||
'1080p': [ '1080' ],
|
||||
'AAC': [ 'He-AAC', 'AAC-He' ],
|
||||
'Special Edition': [ 'Special' ],
|
||||
property_synonyms = { 'Special Edition': [ 'Special' ],
|
||||
'Collector Edition': [ 'Collector' ],
|
||||
'Criterion Edition': [ 'Criterion' ],
|
||||
'Minisode': [ 'Minisodes' ]
|
||||
'Criterion Edition': [ 'Criterion' ]
|
||||
}
|
||||
|
||||
|
||||
def revert_synonyms():
|
||||
reverse = {}
|
||||
|
||||
for _, values in properties.items():
|
||||
for value in values:
|
||||
reverse[value.lower()] = value
|
||||
|
||||
for canonical, synonyms in property_synonyms.items():
|
||||
for synonym in synonyms:
|
||||
reverse[synonym.lower()] = canonical
|
||||
|
||||
return reverse
|
||||
|
||||
|
||||
reverse_synonyms = revert_synonyms()
|
||||
|
||||
|
||||
def canonical_form(string):
    """Return the canonical spelling registered for *string*.

    The lookup in ``reverse_synonyms`` is done on the lower-cased string, so
    it is case-insensitive. A string with no registered entry is returned
    unchanged.
    """
    key = string.lower()
    if key in reverse_synonyms:
        return reverse_synonyms[key]
    return string
|
||||
|
||||
|
||||
def compute_canonical_form(property_name, value):
    """Return the canonical form of a property value, or None if not valid.

    Looks up the compiled patterns registered for ``property_name`` in
    ``properties_rexps`` and returns the first canonical form for which one
    of its patterns matches at the start of ``value``.
    """
    candidates = properties_rexps[property_name]
    for canonical, patterns in candidates.items():
        # 'canonical' (not 'canonical_form') so the module-level function of
        # that name is not shadowed inside this function.
        if any(pattern.match(value) for pattern in patterns):
            return canonical
    return None
|
||||
|
||||
56
libs/guessit/slogging.py
Normal file → Executable file
56
libs/guessit/slogging.py
Normal file → Executable file
@@ -21,6 +21,8 @@
|
||||
from __future__ import unicode_literals
|
||||
import logging
|
||||
import sys
|
||||
import os, os.path
|
||||
|
||||
|
||||
GREEN_FONT = "\x1B[0;32m"
|
||||
YELLOW_FONT = "\x1B[0;33m"
|
||||
@@ -29,33 +31,57 @@ RED_FONT = "\x1B[0;31m"
|
||||
RESET_FONT = "\x1B[0m"
|
||||
|
||||
|
||||
def setupLogging(colored=True):
|
||||
def setupLogging(colored=True, with_time=False, with_thread=False, filename=None):
|
||||
"""Set up a nice colored logger as the main application logger."""
|
||||
|
||||
class SimpleFormatter(logging.Formatter):
|
||||
def __init__(self):
|
||||
self.fmt = '%(levelname)-8s %(module)s:%(funcName)s -- %(message)s'
|
||||
def __init__(self, with_time, with_thread):
|
||||
self.fmt = (('%(asctime)s ' if with_time else '') +
|
||||
'%(levelname)-8s ' +
|
||||
'[%(name)s:%(funcName)s]' +
|
||||
('[%(threadName)s]' if with_thread else '') +
|
||||
' -- %(message)s')
|
||||
logging.Formatter.__init__(self, self.fmt)
|
||||
|
||||
class ColoredFormatter(logging.Formatter):
|
||||
def __init__(self):
|
||||
self.fmt = ('%(levelname)-8s ' +
|
||||
BLUE_FONT + '%(name)s:%(funcName)s' +
|
||||
RESET_FONT + ' -- %(message)s')
|
||||
def __init__(self, with_time, with_thread):
|
||||
self.fmt = (('%(asctime)s ' if with_time else '') +
|
||||
'-CC-%(levelname)-8s ' +
|
||||
BLUE_FONT + '[%(name)s:%(funcName)s]' +
|
||||
RESET_FONT + ('[%(threadName)s]' if with_thread else '') +
|
||||
' -- %(message)s')
|
||||
|
||||
logging.Formatter.__init__(self, self.fmt)
|
||||
|
||||
def format(self, record):
|
||||
modpath = record.name.split('.')
|
||||
record.mname = modpath[0]
|
||||
record.mmodule = '.'.join(modpath[1:])
|
||||
result = logging.Formatter.format(self, record)
|
||||
if record.levelno in (logging.DEBUG, logging.INFO):
|
||||
return GREEN_FONT + result
|
||||
if record.levelno == logging.DEBUG:
|
||||
color = BLUE_FONT
|
||||
elif record.levelno == logging.INFO:
|
||||
color = GREEN_FONT
|
||||
elif record.levelno == logging.WARNING:
|
||||
return YELLOW_FONT + result
|
||||
color = YELLOW_FONT
|
||||
else:
|
||||
return RED_FONT + result
|
||||
color = RED_FONT
|
||||
|
||||
ch = logging.StreamHandler()
|
||||
if colored and sys.platform != 'win32':
|
||||
ch.setFormatter(ColoredFormatter())
|
||||
result = result.replace('-CC-', color)
|
||||
return result
|
||||
|
||||
if filename is not None:
|
||||
# make sure we can write to our log file
|
||||
logdir = os.path.dirname(filename)
|
||||
if not os.path.exists(logdir):
|
||||
os.makedirs(logdir)
|
||||
ch = logging.FileHandler(filename, mode='w')
|
||||
ch.setFormatter(SimpleFormatter(with_time, with_thread))
|
||||
else:
|
||||
ch.setFormatter(SimpleFormatter())
|
||||
ch = logging.StreamHandler()
|
||||
if colored and sys.platform != 'win32':
|
||||
ch.setFormatter(ColoredFormatter(with_time, with_thread))
|
||||
else:
|
||||
ch.setFormatter(SimpleFormatter(with_time, with_thread))
|
||||
|
||||
logging.getLogger().addHandler(ch)
|
||||
|
||||
22
libs/guessit/textutils.py
Normal file → Executable file
22
libs/guessit/textutils.py
Normal file → Executable file
@@ -2,7 +2,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Smewt - A smart collection manager
|
||||
# Copyright (c) 2008 Nicolas Wack <wackou@gmail.com>
|
||||
# Copyright (c) 2008-2012 Nicolas Wack <wackou@gmail.com>
|
||||
#
|
||||
# Smewt is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
@@ -23,10 +23,13 @@ from guessit import s
|
||||
from guessit.patterns import sep
|
||||
import functools
|
||||
import unicodedata
|
||||
import copy
|
||||
import re
|
||||
|
||||
# string-related functions
|
||||
|
||||
def normalize_unicode(s):
    """Return the text *s* converted to Unicode normalization form NFC."""
    composed = unicodedata.normalize('NFC', s)
    return composed
|
||||
|
||||
|
||||
def strip_brackets(s):
|
||||
if not s:
|
||||
@@ -55,6 +58,21 @@ def clean_string(s):
|
||||
return result
|
||||
|
||||
|
||||
# Raw string so that '\w' reaches the regex engine untouched instead of being
# treated as an (invalid) string escape sequence.
_words_rexp = re.compile(r'\w+', re.UNICODE)


def find_words(s):
    """Return the list of words found in *s*, in order of appearance.

    A word is a maximal run of word characters. Underscores are treated as
    separators: they are replaced by spaces before matching, because ``\\w``
    would otherwise consider them part of a word.
    """
    return _words_rexp.findall(s.replace('_', ' '))
|
||||
|
||||
|
||||
def reorder_title(title):
    """Move a trailing ", the" article to the front of *title*.

    ``'Office, The'`` becomes ``'The Office'``. The suffix is matched
    case-insensitively (with or without a space after the comma) and the
    article keeps the casing it had in the input. Whitespace left between the
    title and the comma is dropped so the result has no trailing space.

    A title without such a suffix, or one that consists only of the suffix,
    is returned unchanged.
    """
    lowered = title.lower()
    for suffix in (', the', ',the'):
        if lowered.endswith(suffix):
            article = title[-3:]
            remainder = title[:-len(suffix)].rstrip()
            if not remainder:
                # nothing to put the article in front of
                return title
            return article + ' ' + remainder
    return title
|
||||
|
||||
|
||||
def str_replace(string, pos, c):
    """Return a copy of *string* with the character at index *pos* replaced by *c*.

    *c* may be any string (including the empty string, which deletes the
    character). Negative indices within range count from the end, as with
    regular indexing.
    """
    # Normalize in-range negative indices: with pos == -1 the naive slice
    # string[pos+1:] would be string[0:], i.e. the whole string again.
    if -len(string) <= pos < 0:
        pos += len(string)
    return string[:pos] + c + string[pos + 1:]
|
||||
|
||||
|
||||
2
libs/guessit/transfo/__init__.py
Normal file → Executable file
2
libs/guessit/transfo/__init__.py
Normal file → Executable file
@@ -45,7 +45,7 @@ def format_guess(guess):
|
||||
elif isinstance(value, base_text_type):
|
||||
if prop in ('edition',):
|
||||
value = clean_string(value)
|
||||
guess[prop] = canonical_form(value)
|
||||
guess[prop] = canonical_form(value).replace('\\', '')
|
||||
|
||||
return guess
|
||||
|
||||
|
||||
0
libs/guessit/transfo/guess_bonus_features.py
Normal file → Executable file
0
libs/guessit/transfo/guess_bonus_features.py
Normal file → Executable file
24
libs/guessit/transfo/guess_country.py
Normal file → Executable file
24
libs/guessit/transfo/guess_country.py
Normal file → Executable file
@@ -19,24 +19,30 @@
|
||||
#
|
||||
|
||||
from __future__ import unicode_literals
|
||||
#from guessit.transfo import SingleNodeGuesser
|
||||
#from guessit.date import search_year
|
||||
from guessit.country import Country
|
||||
from guessit import Guess
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# list of common words which could be interpreted as countries, but which
|
||||
# are far too common to be able to say they represent a country
|
||||
country_common_words = frozenset([ 'bt', 'bb' ])
|
||||
|
||||
def process(mtree):
|
||||
for node in mtree.unidentified_leaves():
|
||||
# only keep explicit groups (enclosed in parentheses/brackets)
|
||||
if len(node.node_idx) == 2:
|
||||
try:
|
||||
country = Country(node.value[1:-1], strict=True)
|
||||
if node.value[0] + node.value[-1] not in ['()', '[]', '{}']:
|
||||
continue
|
||||
node.guess = Guess(country=country, confidence=1.0)
|
||||
c = node.value[1:-1].lower()
|
||||
if c in country_common_words:
|
||||
continue
|
||||
|
||||
# only keep explicit groups (enclosed in parentheses/brackets)
|
||||
if node.value[0] + node.value[-1] not in ['()', '[]', '{}']:
|
||||
continue
|
||||
|
||||
try:
|
||||
country = Country(c, strict=True)
|
||||
except ValueError:
|
||||
pass
|
||||
continue
|
||||
|
||||
node.guess = Guess(country=country, confidence=1.0)
|
||||
|
||||
0
libs/guessit/transfo/guess_date.py
Normal file → Executable file
0
libs/guessit/transfo/guess_date.py
Normal file → Executable file
0
libs/guessit/transfo/guess_episode_info_from_position.py
Normal file → Executable file
0
libs/guessit/transfo/guess_episode_info_from_position.py
Normal file → Executable file
0
libs/guessit/transfo/guess_episodes_rexps.py
Normal file → Executable file
0
libs/guessit/transfo/guess_episodes_rexps.py
Normal file → Executable file
6
libs/guessit/transfo/guess_filetype.py
Normal file → Executable file
6
libs/guessit/transfo/guess_filetype.py
Normal file → Executable file
@@ -21,7 +21,7 @@
|
||||
from __future__ import unicode_literals
|
||||
from guessit import Guess
|
||||
from guessit.patterns import (subtitle_exts, video_exts, episode_rexps,
|
||||
find_properties, canonical_form)
|
||||
find_properties, compute_canonical_form)
|
||||
from guessit.date import valid_year
|
||||
from guessit.textutils import clean_string
|
||||
import os.path
|
||||
@@ -89,7 +89,7 @@ def guess_filetype(mtree, filetype):
|
||||
|
||||
# check whether we are in a 'Movies', 'Tv Shows', ... folder
|
||||
folder_rexps = [ (r'Movies?', upgrade_movie),
|
||||
(r'Tv ?Shows?', upgrade_episode),
|
||||
(r'Tv[ _-]?Shows?', upgrade_episode),
|
||||
(r'Series', upgrade_episode)
|
||||
]
|
||||
for frexp, upgrade_func in folder_rexps:
|
||||
@@ -142,7 +142,7 @@ def guess_filetype(mtree, filetype):
|
||||
upgrade_episode()
|
||||
break
|
||||
|
||||
elif canonical_form(value) == 'DVB':
|
||||
elif compute_canonical_form('format', value) == 'DVB':
|
||||
upgrade_episode()
|
||||
break
|
||||
|
||||
|
||||
15
libs/guessit/transfo/guess_language.py
Normal file → Executable file
15
libs/guessit/transfo/guess_language.py
Normal file → Executable file
@@ -22,7 +22,7 @@ from __future__ import unicode_literals
|
||||
from guessit import Guess
|
||||
from guessit.transfo import SingleNodeGuesser
|
||||
from guessit.language import search_language
|
||||
from guessit.textutils import clean_string
|
||||
from guessit.textutils import clean_string, find_words
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
@@ -31,18 +31,13 @@ log = logging.getLogger(__name__)
|
||||
def guess_language(string):
|
||||
language, span, confidence = search_language(string)
|
||||
if language:
|
||||
# is it a subtitle language?
|
||||
if 'sub' in clean_string(string[:span[0]]).lower().split(' '):
|
||||
return (Guess({'subtitleLanguage': language},
|
||||
confidence=confidence),
|
||||
span)
|
||||
else:
|
||||
return (Guess({'language': language},
|
||||
confidence=confidence),
|
||||
span)
|
||||
return (Guess({'language': language},
|
||||
confidence=confidence),
|
||||
span)
|
||||
|
||||
return None, None
|
||||
|
||||
|
||||
def process(mtree):
|
||||
SingleNodeGuesser(guess_language, None, log).process(mtree)
|
||||
# Note: 'language' is promoted to 'subtitleLanguage' in the post_process transfo
|
||||
|
||||
1
libs/guessit/transfo/guess_movie_title_from_position.py
Normal file → Executable file
1
libs/guessit/transfo/guess_movie_title_from_position.py
Normal file → Executable file
@@ -20,6 +20,7 @@
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from guessit import Guess
|
||||
import unicodedata
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
0
libs/guessit/transfo/guess_properties.py
Normal file → Executable file
0
libs/guessit/transfo/guess_properties.py
Normal file → Executable file
50
libs/guessit/transfo/guess_release_group.py
Normal file → Executable file
50
libs/guessit/transfo/guess_release_group.py
Normal file → Executable file
@@ -20,49 +20,51 @@
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from guessit.transfo import SingleNodeGuesser
|
||||
from guessit.patterns import properties, canonical_form
|
||||
from guessit.patterns import prop_multi, compute_canonical_form, _dash, _psep
|
||||
import re
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
def get_patterns(property_name):
    """Return a flat list of the patterns registered for *property_name*.

    Every pattern listed under ``prop_multi[property_name]`` is collected,
    with each occurrence of ``_dash`` replaced by ``_psep``.
    """
    collected = []
    for patterns in prop_multi[property_name].values():
        for pattern in patterns:
            collected.append(pattern.replace(_dash, _psep))
    return collected
|
||||
|
||||
CODECS = properties['videoCodec']
|
||||
FORMATS = properties['format']
|
||||
CODECS = get_patterns('videoCodec')
|
||||
FORMATS = get_patterns('format')
|
||||
|
||||
GROUP_NAMES = [ r'(?P<videoCodec>' + codec + r')-?(?P<releaseGroup>.*?)[ \.]'
|
||||
for codec in CODECS ]
|
||||
GROUP_NAMES += [ r'(?P<format>' + fmt + r')-?(?P<releaseGroup>.*?)[ \.]'
|
||||
for fmt in FORMATS ]
|
||||
|
||||
GROUP_NAMES2 = [ r'\.(?P<videoCodec>' + codec + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
|
||||
for codec in CODECS ]
|
||||
GROUP_NAMES2 += [ r'\.(?P<format>' + fmt + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
|
||||
for fmt in FORMATS ]
|
||||
|
||||
GROUP_NAMES = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES ]
|
||||
GROUP_NAMES2 = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES2 ]
|
||||
|
||||
def adjust_metadata(md):
|
||||
codec = canonical_form(md['videoCodec'])
|
||||
if codec in FORMATS:
|
||||
md['format'] = codec
|
||||
del md['videoCodec']
|
||||
return md
|
||||
return dict((property_name, compute_canonical_form(property_name, value) or value)
|
||||
for property_name, value in md.items())
|
||||
|
||||
|
||||
def guess_release_group(string):
|
||||
group_names = [ r'\.(Xvid)-(?P<releaseGroup>.*?)[ \.]',
|
||||
r'\.(DivX)-(?P<releaseGroup>.*?)[\. ]',
|
||||
r'\.(DVDivX)-(?P<releaseGroup>.*?)[\. ]',
|
||||
]
|
||||
|
||||
# first try to see whether we have both a known codec and a known release group
|
||||
group_names = [ r'\.(?P<videoCodec>' + codec + r')-(?P<releaseGroup>.*?)[ \.]'
|
||||
for codec in (CODECS + FORMATS) ]
|
||||
|
||||
for rexp in group_names:
|
||||
match = re.search(rexp, string, re.IGNORECASE)
|
||||
for rexp in GROUP_NAMES:
|
||||
match = rexp.search(string)
|
||||
if match:
|
||||
metadata = match.groupdict()
|
||||
if canonical_form(metadata['releaseGroup']) in properties['releaseGroup']:
|
||||
release_group = compute_canonical_form('releaseGroup', metadata['releaseGroup'])
|
||||
if release_group:
|
||||
return adjust_metadata(metadata), (match.start(1), match.end(2))
|
||||
|
||||
# pick anything as releaseGroup as long as we have a codec in front
|
||||
# this doesn't include a potential dash ('-') ending the release group
|
||||
# eg: [...].X264-HiS@SiLUHD-English.[...]
|
||||
group_names = [ r'\.(?P<videoCodec>' + codec + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
|
||||
for codec in (CODECS + FORMATS) ]
|
||||
|
||||
for rexp in group_names:
|
||||
match = re.search(rexp, string, re.IGNORECASE)
|
||||
for rexp in GROUP_NAMES2:
|
||||
match = rexp.search(string)
|
||||
if match:
|
||||
return adjust_metadata(match.groupdict()), (match.start(1), match.end(2))
|
||||
|
||||
|
||||
0
libs/guessit/transfo/guess_video_rexps.py
Normal file → Executable file
0
libs/guessit/transfo/guess_video_rexps.py
Normal file → Executable file
0
libs/guessit/transfo/guess_weak_episodes_rexps.py
Normal file → Executable file
0
libs/guessit/transfo/guess_weak_episodes_rexps.py
Normal file → Executable file
0
libs/guessit/transfo/guess_website.py
Normal file → Executable file
0
libs/guessit/transfo/guess_website.py
Normal file → Executable file
0
libs/guessit/transfo/guess_year.py
Normal file → Executable file
0
libs/guessit/transfo/guess_year.py
Normal file → Executable file
19
libs/guessit/transfo/post_process.py
Normal file → Executable file
19
libs/guessit/transfo/post_process.py
Normal file → Executable file
@@ -20,6 +20,7 @@
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from guessit.patterns import subtitle_exts
|
||||
from guessit.textutils import reorder_title, find_words
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
@@ -45,6 +46,15 @@ def process(mtree):
|
||||
node == mtree.leaves()[-2]):
|
||||
promote_subtitle()
|
||||
|
||||
# - if we find the word 'sub' before the language, and in the same explicit
|
||||
# group, then upgrade the language
|
||||
explicit_group = mtree.node_at(node.node_idx[:2])
|
||||
group_str = explicit_group.value.lower()
|
||||
|
||||
if ('sub' in find_words(group_str) and
|
||||
0 <= group_str.find('sub') < (node.span[0] - explicit_group.span[0])):
|
||||
promote_subtitle()
|
||||
|
||||
# - if a language is in an explicit group just preceded by "st",
|
||||
# it is a subtitle language (eg: '...st[fr-eng]...')
|
||||
try:
|
||||
@@ -60,11 +70,4 @@ def process(mtree):
|
||||
if 'series' not in node.guess:
|
||||
continue
|
||||
|
||||
series = node.guess['series']
|
||||
lseries = series.lower()
|
||||
|
||||
if lseries[-4:] == ',the':
|
||||
node.guess['series'] = 'The ' + series[:-4]
|
||||
|
||||
if lseries[-5:] == ', the':
|
||||
node.guess['series'] = 'The ' + series[:-5]
|
||||
node.guess['series'] = reorder_title(node.guess['series'])
|
||||
|
||||
0
libs/guessit/transfo/split_explicit_groups.py
Normal file → Executable file
0
libs/guessit/transfo/split_explicit_groups.py
Normal file → Executable file
10
libs/guessit/transfo/split_on_dash.py
Normal file → Executable file
10
libs/guessit/transfo/split_on_dash.py
Normal file → Executable file
@@ -38,15 +38,5 @@ def process(mtree):
|
||||
indices.extend([ span[0], span[1] ])
|
||||
match = pattern.search(node.value, span[1])
|
||||
|
||||
didx = node.value.find('-')
|
||||
while didx > 0:
|
||||
if (didx > 10 and
|
||||
(didx - 1 not in indices and
|
||||
didx + 2 not in indices)):
|
||||
|
||||
indices.extend([ didx, didx + 1 ])
|
||||
|
||||
didx = node.value.find('-', didx + 1)
|
||||
|
||||
if indices:
|
||||
node.partition(indices)
|
||||
|
||||
0
libs/guessit/transfo/split_path_components.py
Normal file → Executable file
0
libs/guessit/transfo/split_path_components.py
Normal file → Executable file
Reference in New Issue
Block a user