From ebd863ab0ac0d80b73d4cd9c604a08624368bf24 Mon Sep 17 00:00:00 2001 From: mdipierro Date: Mon, 16 Jul 2012 15:36:47 -0500 Subject: [PATCH] added utf8 and utf8 support in Markminn, thanks Vladyslav --- VERSION | 2 +- gluon/utf8.py | 651 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 652 insertions(+), 1 deletion(-) create mode 100644 gluon/utf8.py diff --git a/VERSION b/VERSION index 4ab61e83..c60db99a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -Version 2.00.0 (2012-07-16 14:25:16) dev +Version 2.00.0 (2012-07-16 15:36:41) dev diff --git a/gluon/utf8.py b/gluon/utf8.py new file mode 100644 index 00000000..71e89b1b --- /dev/null +++ b/gluon/utf8.py @@ -0,0 +1,651 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +This file is part of the web2py Web Framework +Copyrighted by Massimo Di Pierro +License: LGPLv3 (http://www.gnu.org/licenses/lgpl.html) + +Created by Vladyslav Kozlovskyy (Ukraine) + for Web2py project + +Utilities and class for UTF8 strings managing +=========================================== +""" +import __builtin__ +__all__ = ['Utf8'] + +repr_escape_tab={} +for i in xrange(1,32): repr_escape_tab[i]=ur'\x%02i'%i +repr_escape_tab[7]=u'\\a' +repr_escape_tab[8]=u'\\b' +repr_escape_tab[9]=u'\\t' +repr_escape_tab[10]=u'\\n' +repr_escape_tab[11]=u'\\v' +repr_escape_tab[12]=u'\\f' +repr_escape_tab[13]=u'\\r' +repr_escape_tab[ord('\\')]=u'\\\\' +repr_escape_tab2=repr_escape_tab.copy() +repr_escape_tab2[ord('\'')]=u"\\'" + +def sort_key(s): + """ Unicode Collation Algorithm (UCA) (http://www.unicode.org/reports/tr10/) + is used for utf-8 and unicode strings sorting and for utf-8 strings + comparison + + NOTE: pyuca is a very memory cost module! It loads the whole + "allkey.txt" file (~2mb!) into the memory. But this + functionality is needed only when sort_key() is called as a + part of sort() function or when Utf8 strings are compared. + + So, it is a lazy "sort_key" function which (ONLY ONCE, ON ITS + FIRST CALL) imports pyuca and replaces itself with a real + sort_key() function + """ + global sort_key + from contrib.pyuca import unicode_collator + unicode_sort_key = unicode_collator.sort_key + sort_key=lambda s: unicode_sort_key( + unicode(s, 'utf-8') if isinstance(s, str) else s) + return sort_key(s) + + +def ord(char): + """ returns unicode id for utf8 or unicode *char* character + + SUPPOSE that *char* is an utf-8 or unicode character only + """ + if isinstance(char, unicode): return __builtin__.ord(char) + return __builtin__.ord(unicode(char, 'utf-8')) + +def chr(code): + """ return utf8-character with *code* unicode id """ + return Utf8(unichr(code)) + +def size(string): + """ return length of utf-8 string in bytes + NOTE! The length of correspondent utf-8 + string is returned for unicode string + """ + return Utf8(string).__size__() + +class Utf8(str): + """ + Class for utf8 string storing and manipulations + + The base presupposition of this class usage is: + "ALL strings in the application are either of + utf-8 or unicode type, even when simple str + type is used. UTF-8 is only a "packed" version + of unicode, so Utf-8 and unicode strings are + interchangeable." + + CAUTION! This class is slower than str/unicode! + Do NOT use it inside intensive loops. Simply + decode string(s) to unicode before loop and + encode it back to utf-8 string(s) after + intensive calculation. + + You can see the benefit of this class in doctests() below + """ + def __new__(cls, content='', codepage='utf-8'): + if isinstance(content, unicode): + return str.__new__(cls, unicode.encode(content, 'utf-8')) + elif codepage in ('utf-8', 'utf8') or isinstance(content, cls): + return str.__new__(cls, content) + else: + return str.__new__(cls, unicode(content, codepage).encode('utf-8')) + + def __repr__(self): + r''' # note that we use raw strings to avoid having to use double back slashes below + NOTE! This function is a clone of web2py:gluon.languages.utf_repl() function + + utf8.__repr__() works same as str.repr() when processing ascii string + >>> repr(Utf8('abc')) == repr(Utf8("abc")) == repr('abc') == repr("abc") == "'abc'" + True + >>> repr(Utf8('a"b"c')) == repr('a"b"c') == '\'a"b"c\'' + True + >>> repr(Utf8("a'b'c")) == repr("a'b'c") == '"a\'b\'c"' + True + >>> repr(Utf8('a\'b"c')) == repr('a\'b"c') == repr(Utf8("a'b\"c")) == repr("a'b\"c") == '\'a\\\'b"c\'' + True + >>> repr(Utf8('a\r\nb')) == repr('a\r\nb') == "'a\\r\\nb'" # Test for \r, \n + True + + Unlike str.repr(), Utf8.__repr__() remains utf8 content when processing utf8 string + >>> repr(Utf8('中文字')) == repr(Utf8("中文字")) == "'中文字'" != repr('中文字') + True + >>> repr(Utf8('中"文"字')) == "'中\"文\"字'" != repr('中"文"字') + True + >>> repr(Utf8("中'文'字")) == '"中\'文\'字"' != repr("中'文'字") + True + >>> repr(Utf8('中\'文"字')) == repr(Utf8("中'文\"字")) == '\'中\\\'文"字\'' != repr('中\'文"字') == repr("中'文\"字") + True + >>> repr(Utf8('中\r\n文')) == "'中\\r\\n文'" != repr('中\r\n文') # Test for \r, \n + True + ''' + if str.find(self,"'") >= 0 and str.find(self,'"') < 0: # only single quote exists + return '"'+unicode(self, 'utf-8').translate(repr_escape_tab).encode('utf-8')+'"' + else: + return "'"+unicode(self, 'utf-8').translate(repr_escape_tab2).encode('utf-8')+"'" + + def truncate(self, length, dots='...'): + """ returns string of length < *length* or truncate + string with adding *dots* suffix to the string's end + + args: + length (int): max length of string + dots (str or unicode): string suffix, when string is cutted + + returns: + (utf8-str): original or cutted string + """ + text = unicode(self, 'utf-8') + dots = unicode(dots, 'utf-8') if isinstance(dots, str) else dots + if len(text) > length: + text = text[:length-len(dots)] + dots + return str.__new__(Utf8, text.encode('utf-8')) + + def __size__(self): + """ length of utf-8 string in bytes """ + return str.__len__(self) + + def __contains__(self, other): return str.__contains__(self, Utf8(other)) + def __getitem__(self, index): return str.__new__(Utf8, unicode(self, 'utf-8')[index].encode('utf-8')) + def __getslice__(self, begin, end): return str.__new__(Utf8, unicode(self, 'utf-8')[begin:end].encode('utf-8')) + + def __add__(self, other): return str.__new__(Utf8, str.__add__(self, unicode.encode(other, 'utf-8') + if isinstance(other, unicode) else other)) + def __len__(self): return len(unicode(self, 'utf-8')) + def __mul__(self, integer):return str.__new__(Utf8, str.__mul__(self, integer)) + def __eq__(self, string): return str.__eq__(self, Utf8(string)) + def __ne__(self, string): return str.__ne__(self, Utf8(string)) + def capitalize(self): return str.__new__(Utf8, unicode(self, 'utf-8').capitalize().encode('utf-8')) + def center(self, length): return str.__new__(Utf8, unicode(self, 'utf-8').center(length).encode('utf-8')) + def upper(self): return str.__new__(Utf8, unicode(self, 'utf-8').upper().encode('utf-8')) + def lower(self): return str.__new__(Utf8, unicode(self, 'utf-8').lower().encode('utf-8')) + def title(self): return str.__new__(Utf8, unicode(self, 'utf-8').title().encode('utf-8')) + def index(self, string): return unicode(self, 'utf-8').index(string if isinstance(string,unicode) else unicode(string, 'utf-8')) + def isalnum(self): return unicode(self, 'utf-8').isalnum() + def isalpha(self): return unicode(self, 'utf-8').isalpha() + def isdigit(self): return unicode(self, 'utf-8').isdigit() + def islower(self): return unicode(self, 'utf-8').islower() + def isspace(self): return unicode(self, 'utf-8').isspace() + def istitle(self): return unicode(self, 'utf-8').istitle() + def isupper(self): return unicode(self, 'utf-8').isupper() + def zfill(self, length): return str.__new__(Utf8, unicode(self, 'utf-8').zfill(length).encode('utf-8')) + def join(self, iter): return str.__new__(Utf8, str.join(self, [Utf8(c) for c in + list(unicode(iter, 'utf-8') if + isinstance(iter, str) else + iter)])) + def lstrip(self, chars=None): return str.__new__(Utf8, str.lstrip(self, None if chars is None else Utf8(chars))) + def rstrip(self, chars=None ): return str.__new__(Utf8, str.rstrip(self, None if chars is None else Utf8(chars))) + def strip(self, chars=None): return str.__new__(Utf8, str.strip(self, None if chars is None else Utf8(chars))) + def swapcase(self): return str.__new__(Utf8, unicode(self, 'utf-8').swapcase().encode('utf-8')) + + def count(self, sub, start=0, end=None): + unistr = unicode(self, 'utf-8') + return unistr.count(unicode(sub, 'utf-8') if isinstance(sub, str) else sub, + start, len(unistr) if end is None else end) + def decode(self, encoding='utf-8', errors='strict'): return str.decode(self, encoding, errors) + def encode(self, encoding, errors='strict'): return unicode(self, 'utf-8').encode(encoding, errors) + def expandtabs(self, tabsize=8): return str.__new__(Utf8, unicode(self, 'utf-8').expandtabs(tabsize).encode('utf-8')) + def find(self, sub, start=None, end=None): return unicode(self, 'utf-8').find(unicode(sub, 'utf-8') + if isinstance(sub, str) else sub, start, end) + def ljust(self, width, fillchar=' '): return str.__new__(Utf8, unicode(self, 'utf-8').ljust(width, unicode(fillchar, 'utf-8') + if isinstance(fillchar, str) else fillchar).encode('utf-8')) + def partition(self, sep): + (head, sep, tail) = str.partition(self, Utf8(sep)) + return ( str.__new__(Utf8, head), + str.__new__(Utf8, sep), + str.__new__(Utf8, tail) ) + + def replace(self, old, new, count=-1): return str.__new__(Utf8, str.replace(self, Utf8(old), Utf8(new), count)) + def rfind(self, sub, start=None, end=None): return unicode(self, 'utf-8').rfind(unicode(sub, 'utf-8') + if isinstance(sub, str) else sub, start, end) + def rindex(self, string): return unicode(self, 'utf-8').rindex(string if isinstance(string,unicode) + else unicode(string, 'utf-8')) + def rjust(self, width, fillchar=' '): return str.__new__(Utf8, unicode(self, 'utf-8').rjust(width, unicode(fillchar, 'utf-8') + if isinstance(fillchar, str) else fillchar).encode('utf-8')) + def rpartition(self, sep): + (head, sep, tail) = str.rpartition(self, Utf8(sep)) + return ( str.__new__(Utf8, head), + str.__new__(Utf8, sep), + str.__new__(Utf8, tail) ) + def rsplit(self, sep=None, maxsplit=-1): return [str.__new__(Utf8, part) for part in str.rsplit(self, + None if sep is None else Utf8(sep), maxsplit)] + def split(self, sep=None, maxsplit=-1): return [str.__new__(Utf8, part) for part in str.split(self, + None if sep is None else Utf8(sep), maxsplit)] + def splitlines(self,keepends=False): return [str.__new__(Utf8, part) for part in str.splitlines(self,keepends)] + def startswith(self, prefix, start=0, end=None): + unistr = unicode(self, 'utf-8') + if isinstance(prefix, tuple): + prefix = tuple(unicode(s,'utf-8') if isinstance(s, str) else s for s in prefix) + elif isinstance(prefix, str): + prefix = unicode(prefix, 'utf-8') + return unistr.startswith(prefix, start, len(unistr) if end is None else end) + def translate(self, table, deletechars=''): + if isinstance(table, dict): + return str.__new__(Utf8, unicode(self, 'utf-8').translate(table).encode('utf-8')) + else: + return str.__new__(Utf8, str.translate(self, table, deletechars)) + def endswith(self, prefix, start=0, end=None): + unistr = unicode(self, 'utf-8') + if isinstance(prefix, tuple): + prefix = tuple(unicode(s,'utf-8') if isinstance(s, str) else s for s in prefix) + elif isinstance(prefix, str): + prefix = unicode(prefix, 'utf-8') + return unistr.endswith(prefix, start, len(unistr) if end is None else end) + if hasattr(str, 'format'): # Python 2.5 hasn't got str.format() method + def format(self, *args, **kwargs): + args = [unicode(s, 'utf-8') if isinstance(s, str) else s for s in args] + kwargs = dict((unicode(k, 'utf-8') if isinstance(k, str) else k, + unicode(v, 'utf-8') if isinstance(v, str) else v) + for k,v in kwargs.iteritems()) + return str.__new__(Utf8, unicode(self, 'utf-8'). + format(*args, **kwargs).encode('utf-8')) + def __mod__(self, right): + if isinstance(right, tuple): + right = tuple(unicode(v, 'utf-8') if isinstance(v, str) else v + for v in right) + elif isinstance(right, dict): + right = dict((unicode(k, 'utf-8') if isinstance(k, str) else k, + unicode(v, 'utf-8') if isinstance(v, str) else v) + for k,v in right.iteritems()) + elif isinstance(right, str): + right = unicode(right, 'utf-8') + return str.__new__(Utf8, unicode(self, 'utf-8').__mod__(right).encode('utf-8')) + def __ge__(self, string): return sort_key(self) >= sort_key(string) + def __gt__(self, string): return sort_key(self) > sort_key(string) + def __le__(self, string): return sort_key(self) <= sort_key(string) + def __lt__(self, string): return sort_key(self) < sort_key(string) + + +if __name__ == '__main__': + def doctests(): + u""" + doctests: + >>> test_unicode=u'ПРоба Є PRobe' + >>> test_unicode_word=u'ПРоба' + >>> test_number_str='12345' + >>> test_unicode + u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe' + >>> print test_unicode + ПРоба Є PRobe + >>> test_word=test_unicode_word.encode('utf-8') + >>> test_str=test_unicode.encode('utf-8') + >>> s=Utf8(test_str) + >>> s + 'ПРоба Є PRobe' + >>> type(s) + + >>> s == test_str + True + >>> len(test_str) # wrong length of utf8-string! + 19 + >>> len(test_unicode) # RIGHT! + 13 + >>> len(s) # RIGHT! + 13 + >>> size(test_str) # size of utf-8 string (in bytes) == len(str) + 19 + >>> size(test_unicode) # size of unicode string in bytes (packed to utf-8 string) + 19 + >>> size(s) # size of utf-8 string in bytes + 19 + >>> try: # utf-8 is a multibyte string. Convert it to unicode for use with builtin ord() + ... __builtin__.ord('б') # ascii string + ... except Exception, e: + ... print 'Exception:', e + Exception: ord() expected a character, but string of length 2 found + >>> ord('б') # utf8.ord() is used(!!!) + 1073 + >>> ord(u'б') # utf8.ord() is used(!!!) + 1073 + >>> ord(s[3]) # utf8.ord() is used(!!!) + 1073 + >>> chr(ord(s[3])) # utf8.chr() and utf8.chr() is used(!!!) + 'б' + >>> type(chr(1073)) # utf8.chr() is used(!!!) + + >>> s=Utf8(test_unicode) + >>> s + 'ПРоба Є PRobe' + >>> s == test_str + True + >>> test_str == s + True + >>> s == test_unicode + True + >>> test_unicode == s + True + >>> print test_str.upper() # only ASCII characters uppered + ПРоба Є PROBE + >>> print test_unicode.upper() # unicode gives right result + ПРОБА Є PROBE + >>> s.upper() # utf8 class use unicode.upper() + 'ПРОБА Є PROBE' + >>> type(s.upper()) + + >>> s.lower() + 'проба є probe' + >>> type(s.lower()) + + >>> s.capitalize() + 'Проба є probe' + >>> type(s.capitalize()) + + >>> len(s) + 13 + >>> len(test_unicode) + 13 + >>> s+'. Probe is проба' + 'ПРоба Є PRobe. Probe is проба' + >>> type(s+'. Probe is проба') + + >>> s+u'. Probe is проба' + 'ПРоба Є PRobe. Probe is проба' + >>> type(s+u'. Probe is проба') + + >>> s+s + 'ПРоба Є PRobeПРоба Є PRobe' + >>> type(s+s) + + >>> a=s + >>> a+=s + >>> a+=test_unicode + >>> a+=test_str + >>> a + 'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe' + >>> type(a) + + >>> s*3 + 'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe' + >>> type(s*3) + + >>> a=Utf8("-проба-") + >>> a*=10 + >>> a + '-проба--проба--проба--проба--проба--проба--проба--проба--проба--проба-' + >>> type(a) + + >>> print "'"+test_str.center(17)+"'" # WRONG RESULT! + 'ПРоба Є PRobe' + >>> s.center(17) # RIGHT! + ' ПРоба Є PRobe ' + >>> type(s.center(17)) + + >>> (test_word+test_number_str).isalnum() # WRONG RESULT! non ASCII chars are detected as non alpha + False + >>> Utf8(test_word+test_number_str).isalnum() + True + >>> s.isalnum() + False + >>> test_word.isalpha() # WRONG RESULT! Non ASCII characters are detected as non alpha + False + >>> Utf8(test_word).isalpha() # RIGHT! + True + >>> s.lower().islower() + True + >>> s.upper().isupper() + True + >>> print test_str.zfill(17) # WRONG RESULT! + ПРоба Є PRobe + >>> s.zfill(17) # RIGHT! + '0000ПРоба Є PRobe' + >>> type(s.zfill(17)) + + >>> s.istitle() + False + >>> s.title().istitle() + True + >>> Utf8('1234').isdigit() + True + >>> Utf8(' \t').isspace() + True + >>> s.join('•|•') + '•ПРоба Є PRobe|ПРоба Є PRobe•' + >>> s.join((str('(utf8 тест1)'), unicode('(unicode тест2)','utf-8'), '(ascii test3)')) + '(utf8 тест1)ПРоба Є PRobe(unicode тест2)ПРоба Є PRobe(ascii test3)' + >>> type(s) + + >>> s==test_str + True + >>> s==test_unicode + True + >>> s.swapcase() + 'прОБА є prOBE' + >>> type(s.swapcase()) + + >>> s.truncate(10) + 'ПРоба Є...' + >>> s.truncate(20) + 'ПРоба Є PRobe' + >>> s.truncate(10, '•••') # utf-8 string as *dots* + 'ПРоба Є•••' + >>> s.truncate(10, u'®') # you can use unicode string as *dots* + 'ПРоба Є P®' + >>> type(s.truncate(10)) + + >>> Utf8(s.encode('koi8-u'), 'koi8-u') + 'ПРоба Є PRobe' + >>> s.decode() # convert utf-8 string to unicode + u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe' + >>> a='про\\tba' + >>> str_tmp=a.expandtabs() + >>> utf8_tmp=Utf8(a).expandtabs() + >>> utf8_tmp.replace(' ','.') # RIGHT! (default tabsize is 8) + 'про.....ba' + >>> utf8_tmp.index('b') + 8 + >>> print "'"+str_tmp.replace(' ','.')+"'" # WRONG STRING LENGTH! + 'про..ba' + >>> str_tmp.index('b') # WRONG index of 'b' character + 8 + >>> print "'"+a.expandtabs(4).replace(' ','.')+"'" # WRONG RESULT! + 'про..ba' + >>> Utf8(a).expandtabs(4).replace(' ','.') # RIGHT! + 'про.ba' + >>> s.find('Є') + 6 + >>> s.find(u'Є') + 6 + >>> s.find(' ', 6) + 7 + >>> s.rfind(' ') + 7 + >>> s.partition('Є') + ('ПРоба ', 'Є', ' PRobe') + >>> s.partition(u'Є') + ('ПРоба ', 'Є', ' PRobe') + >>> (a,b,c) = s.partition('Є') + >>> type(a), type(b), type(c) + (, , ) + >>> s.partition(' ') + ('ПРоба', ' ', 'Є PRobe') + >>> s.rpartition(' ') + ('ПРоба Є', ' ', 'PRobe') + >>> s.index('Є') + 6 + >>> s.rindex(u'Є') + 6 + >>> s.index(' ') + 5 + >>> s.rindex(' ') + 7 + >>> a=Utf8('а б ц д е а б ц д е а\\tб ц д е') + >>> a.split() + ['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е'] + >>> a.rsplit() + ['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е'] + >>> a.expandtabs().split('б') + ['а ', ' ц д е а ', ' ц д е а ', ' ц д е'] + >>> a.expandtabs().rsplit('б') + ['а ', ' ц д е а ', ' ц д е а ', ' ц д е'] + >>> a.expandtabs().split(u'б', 1) + ['а ', ' ц д е а б ц д е а б ц д е'] + >>> a.expandtabs().rsplit(u'б', 1) + ['а б ц д е а б ц д е а ', ' ц д е'] + >>> a=Utf8("рядок1\\nрядок2\\nрядок3") + >>> a.splitlines() + ['рядок1', 'рядок2', 'рядок3'] + >>> a.splitlines(True) + ['рядок1\\n', 'рядок2\\n', 'рядок3'] + >>> s[6] + 'Є' + >>> s[0] + 'П' + >>> s[-1] + 'e' + >>> s[:10] + 'ПРоба Є PR' + >>> s[2:-2:2] + 'оаЄPo' + >>> s[::-1] + 'eboRP Є абоРП' + >>> s.startswith('ПР') + True + >>> s.startswith(('ПР', u'об'),0) + True + >>> s.startswith(u'об', 2, 4) + True + >>> s.endswith('be') + True + >>> s.endswith(('be', 'PR', u'Є')) + True + >>> s.endswith('PR', 8, 10) + True + >>> s.endswith('Є', -7, -6) + True + >>> s.count(' ') + 2 + >>> s.count(' ',6) + 1 + >>> s.count(u'Є') + 1 + >>> s.count('Є', 0, 5) + 0 + >>> Utf8("Parameters: '%(проба)s', %(probe)04d, %(проба2)s") % { u"проба": s, + ... "not used": "???", "probe": 2, "проба2": u"ПРоба Probe" } + "Parameters: 'ПРоба Є PRobe', 0002, ПРоба Probe" + >>> a=Utf8(u"Параметр: (%s)-(%s)-[%s]") + >>> a%=(s, s[::-1], 1000) + >>> a + 'Параметр: (ПРоба Є PRobe)-(eboRP Є абоРП)-[1000]' + >>> if hasattr(Utf8, 'format'): + ... Utf8("Проба <{0}>, {1}, {param1}, {param2}").format(s, u"中文字", + ... param1="барабан", param2=1000) == 'Проба <ПРоба Є PRobe>, 中文字, барабан, 1000' + ... else: # format() method is not used in python with version <2.6: + ... print True + True + >>> u'Б'>> 'Б'<'Ї' # WRONG ORDER! + False + >>> Utf8('Б')<'Ї' # RIGHT! + True + >>> u'д'>u'ґ' # WRONG ORDER! + False + >>> Utf8('д')>Utf8('ґ') # RIGHT! + True + >>> u'є'<=u'ж' # WRONG ORDER! + False + >>> Utf8('є')<=u'ж' # RIGHT! + True + >>> Utf8('є')<=u'є' + True + >>> u'Ї'>=u'И' # WRONG ORDER! + False + >>> Utf8(u'Ї') >= u'И' # RIGHT + True + >>> Utf8('Є') >= 'Є' + True + >>> a="яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # str type + >>> b=u"яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # unicode type + >>> c=Utf8("яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ") # utf8 class + >>> result = "".join(sorted(a)) + >>> result[0:20] # result is not utf8 string, because bytes, not utf8-characters were sorted + '\\x80\\x81\\x82\\x83\\x84\\x84\\x85\\x86\\x86\\x87\\x87\\x88\\x89\\x8c\\x8e\\x8f\\x90\\x90\\x91\\x91' + >>> try: + ... unicode(result, 'utf-8') # try to convert result (utf-8?) to unicode + ... except Exception, e: + ... print 'Exception:', e + Exception: 'utf8' codec can't decode byte 0x80 in position 0: unexpected code byte + >>> try: # FAILED! (working with bytes, not with utf8-charactes) + ... "".join( sorted(a, key=sort_key) ) # utf8.sort_key may be used with utf8 or unicode strings only! + ... except Exception, e: + ... print 'Exception:', e + Exception: 'utf8' codec can't decode byte 0xd1 in position 0: unexpected end of data + >>> print "".join( sorted(Utf8(a))) # converting *a* to unicode or utf8-string gives us correct result + аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ + >>> print u"".join( sorted(b) ) # WRONG ORDER! Default sort key is used + ЄІЇАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯабвгдежзийклмнопрстуфхцчшщьюяєіїҐґ + >>> print u"".join( sorted(b, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used + аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ + >>> print "".join( sorted(c) ) # RIGHT ORDER! Utf8 "rich comparison" methods are used + аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ + >>> print "".join( sorted(c, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used + аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ + >>> Utf8().join(sorted(c.decode(), key=sort_key)) # convert to unicode for better performance + 'аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ' + >>> for result in sorted(["Іа", "Астро", u"гала", Utf8("Гоша"), "Єва", "шовк", "аякс", "Їжа", + ... "ґанок", Utf8("Дар'я"), "білінг", "веб", u"Жужа", "проба", u"тест", + ... "абетка", "яблуко", "Юляся", "Київ", "лимонад", "ложка", "Матриця", + ... ], key=sort_key): + ... print result.ljust(20), type(result) + абетка + Астро + аякс + білінг + веб + гала + ґанок + Гоша + Дар'я + Єва + Жужа + Іа + Їжа + Київ + лимонад + ложка + Матриця + проба + тест + шовк + Юляся + яблуко + >>> a=Utf8("中文字") + >>> L=list(a) + >>> L + ['中', '文', '字'] + >>> a="".join(L) + >>> print a + 中文字 + >>> type(a) + + >>> a="中文字" # standard str type + >>> L=list(a) + >>> L + ['\\xe4', '\\xb8', '\\xad', '\\xe6', '\\x96', '\\x87', '\\xe5', '\\xad', '\\x97'] + >>> from string import maketrans + >>> str_tab=maketrans('PRobe','12345') + >>> unicode_tab={ord(u'П'):ord(u'Ж'), + ... ord(u'Р') : u'Ш', + ... ord(Utf8('о')) : None, # utf8.ord() is used + ... ord('б') : None, # -//-//- + ... ord(u'а') : u"中文字", + ... ord(u'Є') : Utf8('•').decode(), # only unicode type is supported + ... } + >>> s.translate(unicode_tab).translate(str_tab, deletechars=' ') + 'ЖШ中文字•12345' + """ + import sys + reload(sys) + sys.setdefaultencoding("UTF-8") + import doctest + print "DOCTESTS STARTED..." + doctest.testmod() + print "DOCTESTS FINISHED" + + doctests() +