From ebd863ab0ac0d80b73d4cd9c604a08624368bf24 Mon Sep 17 00:00:00 2001
From: mdipierro <massimo.dipierro@gmail.com>
Date: Mon, 16 Jul 2012 15:36:47 -0500
Subject: [PATCH] added utf8.py and utf8 support in Markmin, thanks Vladyslav

---
 VERSION       |   2 +-
 gluon/utf8.py | 651 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 652 insertions(+), 1 deletion(-)
 create mode 100644 gluon/utf8.py

diff --git a/VERSION b/VERSION
index 4ab61e83..c60db99a 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-Version 2.00.0 (2012-07-16 14:25:16) dev
+Version 2.00.0 (2012-07-16 15:36:41) dev
diff --git a/gluon/utf8.py b/gluon/utf8.py
new file mode 100644
index 00000000..71e89b1b
--- /dev/null
+++ b/gluon/utf8.py
@@ -0,0 +1,651 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+This file is part of the web2py Web Framework
+Copyrighted by Massimo Di Pierro <mdipierro@cs.depaul.edu>
+License: LGPLv3 (http://www.gnu.org/licenses/lgpl.html)
+
+Created by Vladyslav Kozlovskyy (Ukraine) <dbdevelop©gmail.com>
+       for Web2py project
+
+Utilities and a class for managing UTF-8 strings
+================================================
+"""
+import __builtin__
+__all__ = ['Utf8']
+
+repr_escape_tab={} # code point -> escape text; unicode.translate() table used by Utf8.__repr__()
+for i in xrange(1,32): repr_escape_tab[i]=u'\\x%02x'%i # two HEX digits, like repr(): chr(31) -> \x1f
+repr_escape_tab[7]=u'\\a' # the short C-style escapes below override the generic \xNN form
+repr_escape_tab[8]=u'\\b'
+repr_escape_tab[9]=u'\\t'
+repr_escape_tab[10]=u'\\n'
+repr_escape_tab[11]=u'\\v'
+repr_escape_tab[12]=u'\\f'
+repr_escape_tab[13]=u'\\r'
+repr_escape_tab[ord('\\')]=u'\\\\'
+repr_escape_tab2=repr_escape_tab.copy() # same table plus "'", for output wrapped in single quotes
+repr_escape_tab2[ord('\'')]=u"\\'"
+
+def sort_key(s):
+    """ Return the Unicode Collation Algorithm (UCA) key of *s*
+        (http://www.unicode.org/reports/tr10/), used for sorting utf-8
+        and unicode strings and for comparing Utf8 strings
+
+        NOTE: pyuca is a memory hungry module! It loads the whole
+              "allkeys.txt" file (~2mb!) into the memory, so it is
+              imported only when sort_key() is really called, as a
+              key of sort()/sorted() or when Utf8 strings are compared.
+
+        This is a lazy stub: ONLY ONCE, ON ITS FIRST CALL, it imports
+        pyuca and rebinds the module-level name "sort_key" to the real
+        key function, which decodes utf-8 str input before collating
+    """
+    global sort_key
+    from contrib.pyuca import unicode_collator
+    collate = unicode_collator.sort_key
+    def sort_key(s): # the def rebinds the global name declared above
+        return collate(unicode(s, 'utf-8') if isinstance(s, str) else s)
+    return sort_key(s)
+
+
+def ord(char):
+    """ Return the unicode id (code point) of the single character *char*.
+
+        *char* is SUPPOSED to be one unicode or one utf-8 encoded character
+    """
+    uchar = char if isinstance(char, unicode) else unicode(char, 'utf-8')
+    return __builtin__.ord(uchar)
+
+def chr(code):
+    """ return the character with unicode id *code* as an Utf8 string """
+    return str.__new__(Utf8, unichr(code).encode('utf-8'))
+
+def size(string):
+    """ return the number of bytes *string* occupies as utf-8
+        NOTE! A unicode string is measured after being
+              packed (encoded) to its utf-8 form
+    """
+    return str.__len__(Utf8(string))
+
+class Utf8(str):
+   """
+   str subclass that stores UTF-8 bytes but behaves like a sequence of
+   characters: len(), indexing, slicing, searching, padding and case
+   conversion work per unicode character, not per byte.
+
+   The base presupposition of this class usage is:
+   "ALL strings in the application are either of utf-8 or unicode
+   type, even when simple str type is used. UTF-8 is only a "packed"
+   version of unicode, so Utf-8 and unicode strings are interchangeable."
+
+   Arguments may be str (taken as utf-8) or unicode; other types are not
+   converted for == and !=. <, <=, >, >= compare UCA keys (sort_key).
+
+   CAUTION! This class is slower than str/unicode! Do NOT use it inside
+   intensive loops. Simply decode string(s) to unicode before loop and
+   encode it back to utf-8 string(s) after intensive calculation.
+   You can see the benefit of this class in doctests() below
+   """
+   def __new__(cls, content='', codepage='utf-8'):
+      if isinstance(content, unicode):
+         return str.__new__(cls, unicode.encode(content, 'utf-8'))
+      elif codepage in ('utf-8', 'utf8') or isinstance(content, cls):
+         return str.__new__(cls, content)
+      else:
+         return str.__new__(cls, unicode(content, codepage).encode('utf-8'))
+
+   def __repr__(self):
+       r''' # note that we use raw strings to avoid having to use double back slashes below
+       NOTE! This function is a clone of web2py:gluon.languages.utf_repl() function
+
+       utf8.__repr__() works same as str.repr() when processing ascii string
+       >>> repr(Utf8('abc')) == repr(Utf8("abc")) == repr('abc') == repr("abc") == "'abc'"
+       True
+       >>> repr(Utf8('a"b"c')) == repr('a"b"c') == '\'a"b"c\''
+       True
+       >>> repr(Utf8("a'b'c")) == repr("a'b'c") == '"a\'b\'c"'
+       True
+       >>> repr(Utf8('a\'b"c')) == repr('a\'b"c') == repr(Utf8("a'b\"c")) == repr("a'b\"c") == '\'a\\\'b"c\''
+       True
+       >>> repr(Utf8('a\r\nb')) == repr('a\r\nb') == "'a\\r\\nb'" # Test for \r, \n
+       True
+
+       Unlike str.repr(), Utf8.__repr__() remains utf8 content when processing utf8 string
+       >>> repr(Utf8('中文字')) == repr(Utf8("中文字")) == "'中文字'" != repr('中文字')
+       True
+       >>> repr(Utf8('中"文"字')) == "'中\"文\"字'" != repr('中"文"字')
+       True
+       >>> repr(Utf8("中'文'字")) == '"中\'文\'字"' != repr("中'文'字")
+       True
+       >>> repr(Utf8('中\'文"字')) == repr(Utf8("中'文\"字")) == '\'中\\\'文"字\'' != repr('中\'文"字') == repr("中'文\"字")
+       True
+       >>> repr(Utf8('中\r\n文')) == "'中\\r\\n文'" != repr('中\r\n文') # Test for \r, \n
+       True
+       '''
+       if str.find(self,"'") >= 0 and str.find(self,'"') < 0: # only single quote exists
+           return '"'+unicode(self, 'utf-8').translate(repr_escape_tab).encode('utf-8')+'"'
+       else:
+           return "'"+unicode(self, 'utf-8').translate(repr_escape_tab2).encode('utf-8')+"'"
+
+   def truncate(self, length, dots='...'):
+       """ returns the string itself if it has at most *length* characters,
+           else cuts it so that the kept text plus the *dots* suffix fit
+
+       args:
+            length (int): max length of the result, in characters
+            dots (str or unicode): suffix marking a cut string
+
+        returns:
+            (utf8-str): original or cut string
+       """
+       text = unicode(self, 'utf-8')
+       dots = unicode(dots, 'utf-8') if isinstance(dots, str) else dots
+       if len(text) > length: # clamp both bounds: a negative one counts from the end
+           text = (text[:max(length-len(dots), 0)] + dots)[:max(length, 0)]
+       return str.__new__(Utf8, text.encode('utf-8'))
+
+   def __size__(self):
+       """ length of utf-8 string in bytes """
+       return str.__len__(self)
+
+   def __contains__(self, other):      return str.__contains__(self, Utf8(other))
+   def __getitem__(self, index):       return str.__new__(Utf8, unicode(self, 'utf-8')[index].encode('utf-8'))
+   def __getslice__(self, begin, end): return str.__new__(Utf8, unicode(self, 'utf-8')[begin:end].encode('utf-8'))
+
+   def __add__(self, other):  return str.__new__(Utf8, str.__add__(self, unicode.encode(other, 'utf-8') if isinstance(other, unicode) else other))
+   def __len__(self):         return len(unicode(self, 'utf-8'))
+   def __mul__(self, integer):return str.__new__(Utf8, str.__mul__(self, integer))
+   # only str/unicode operands are converted: Utf8(obj) stringifies any object, which made Utf8('1') == 1
+   def __eq__(self, string):  return str.__eq__(self, Utf8(string)) if isinstance(string, basestring) else NotImplemented
+   def __ne__(self, string):  return str.__ne__(self, Utf8(string)) if isinstance(string, basestring) else NotImplemented
+   def capitalize(self):      return str.__new__(Utf8, unicode(self, 'utf-8').capitalize().encode('utf-8'))
+   def center(self, length):  return str.__new__(Utf8, unicode(self, 'utf-8').center(length).encode('utf-8'))
+   def upper(self):           return str.__new__(Utf8, unicode(self, 'utf-8').upper().encode('utf-8'))
+   def lower(self):           return str.__new__(Utf8, unicode(self, 'utf-8').lower().encode('utf-8'))
+   def title(self):           return str.__new__(Utf8, unicode(self, 'utf-8').title().encode('utf-8'))
+   def index(self, string):   return unicode(self, 'utf-8').index(string if isinstance(string,unicode) else unicode(string, 'utf-8'))
+   def isalnum(self):         return unicode(self, 'utf-8').isalnum()
+   def isalpha(self):         return unicode(self, 'utf-8').isalpha()
+   def isdigit(self):         return unicode(self, 'utf-8').isdigit()
+   def islower(self):         return unicode(self, 'utf-8').islower()
+   def isspace(self):         return unicode(self, 'utf-8').isspace()
+   def istitle(self):         return unicode(self, 'utf-8').istitle()
+   def isupper(self):         return unicode(self, 'utf-8').isupper()
+   def zfill(self, length):   return str.__new__(Utf8, unicode(self, 'utf-8').zfill(length).encode('utf-8'))
+   def join(self, iter):      return str.__new__(Utf8, str.join(self, [Utf8(c) for c in
+                                                                       list(unicode(iter, 'utf-8') if
+                                                                            isinstance(iter, str) else iter)]))
+   # *chars* are stripped per character: byte-wise str.strip() may cut a multibyte sequence in half
+   def lstrip(self, chars=None):   return str.__new__(Utf8, str.lstrip(self)) if chars is None else Utf8(self.decode().lstrip(Utf8(chars).decode()))
+   def rstrip(self, chars=None ):  return str.__new__(Utf8, str.rstrip(self)) if chars is None else Utf8(self.decode().rstrip(Utf8(chars).decode()))
+   def strip(self,  chars=None):   return str.__new__(Utf8, str.strip(self)) if chars is None else Utf8(self.decode().strip(Utf8(chars).decode()))
+   def swapcase(self):             return str.__new__(Utf8, unicode(self, 'utf-8').swapcase().encode('utf-8'))
+
+   def count(self, sub, start=0, end=None):
+       unistr = unicode(self, 'utf-8')
+       return unistr.count(unicode(sub, 'utf-8') if isinstance(sub, str) else sub,
+                              start, len(unistr) if end is None else end)
+   def decode(self, encoding='utf-8', errors='strict'): return str.decode(self, encoding, errors)
+   def encode(self, encoding, errors='strict'): return unicode(self, 'utf-8').encode(encoding, errors)
+   def expandtabs(self, tabsize=8): return str.__new__(Utf8, unicode(self, 'utf-8').expandtabs(tabsize).encode('utf-8'))
+   def find(self, sub, start=None, end=None): return unicode(self, 'utf-8').find(unicode(sub, 'utf-8')
+                                                     if isinstance(sub, str) else sub, start, end)
+   def ljust(self, width, fillchar=' '): return str.__new__(Utf8, unicode(self, 'utf-8').ljust(width, unicode(fillchar, 'utf-8')
+                                                              if isinstance(fillchar, str) else fillchar).encode('utf-8'))
+   def partition(self, sep):
+       (head, sep, tail) = str.partition(self, Utf8(sep))
+       return ( str.__new__(Utf8, head),
+                str.__new__(Utf8, sep),
+                str.__new__(Utf8, tail) )
+
+   def replace(self, old, new, count=-1): return str.__new__(Utf8, str.replace(self, Utf8(old), Utf8(new), count))
+   def rfind(self, sub, start=None, end=None): return unicode(self, 'utf-8').rfind(unicode(sub, 'utf-8')
+                                                              if isinstance(sub, str) else sub, start, end)
+   def rindex(self, string):   return unicode(self, 'utf-8').rindex(string if isinstance(string,unicode)
+                                                                         else unicode(string, 'utf-8'))
+   def rjust(self, width, fillchar=' '): return str.__new__(Utf8, unicode(self, 'utf-8').rjust(width, unicode(fillchar, 'utf-8')
+                                                              if isinstance(fillchar, str) else fillchar).encode('utf-8'))
+   def rpartition(self, sep):
+       (head, sep, tail) = str.rpartition(self, Utf8(sep))
+       return ( str.__new__(Utf8, head),
+                str.__new__(Utf8, sep),
+                str.__new__(Utf8, tail) )
+   def rsplit(self, sep=None, maxsplit=-1): return [str.__new__(Utf8, part) for part in str.rsplit(self,
+                                                                 None if sep is None else Utf8(sep), maxsplit)]
+   def split(self, sep=None, maxsplit=-1): return [str.__new__(Utf8, part) for part in str.split(self,
+                                                                 None if sep is None else Utf8(sep), maxsplit)]
+   def splitlines(self,keepends=False): return [str.__new__(Utf8, part) for part in str.splitlines(self,keepends)]
+   def startswith(self, prefix, start=0, end=None):
+       unistr = unicode(self, 'utf-8')
+       if isinstance(prefix, tuple):
+           prefix = tuple(unicode(s,'utf-8') if isinstance(s, str) else s for s in prefix)
+       elif isinstance(prefix, str):
+           prefix = unicode(prefix, 'utf-8')
+       return unistr.startswith(prefix, start, len(unistr) if end is None else end)
+   def translate(self, table, deletechars=''):
+       if isinstance(table, dict):
+           return str.__new__(Utf8, unicode(self, 'utf-8').translate(table).encode('utf-8'))
+       else:
+           return str.__new__(Utf8, str.translate(self, table, deletechars))
+   def endswith(self, prefix, start=0, end=None):
+       unistr = unicode(self, 'utf-8')
+       if isinstance(prefix, tuple):
+           prefix = tuple(unicode(s,'utf-8') if isinstance(s, str) else s for s in prefix)
+       elif isinstance(prefix, str):
+           prefix = unicode(prefix, 'utf-8')
+       return unistr.endswith(prefix, start, len(unistr) if end is None else end)
+   if hasattr(str, 'format'): # Python 2.5 hasn't got str.format() method
+       def format(self, *args, **kwargs):
+           args = [unicode(s, 'utf-8') if isinstance(s, str) else s for s in args]
+           kwargs = dict((unicode(k, 'utf-8') if isinstance(k, str) else k,
+                          unicode(v, 'utf-8') if isinstance(v, str) else v)
+                             for k,v in kwargs.iteritems())
+           return str.__new__(Utf8, unicode(self, 'utf-8').
+                                         format(*args, **kwargs).encode('utf-8'))
+   def __mod__(self, right):
+       if isinstance(right, tuple):
+           right = tuple(unicode(v, 'utf-8') if isinstance(v, str) else v
+                             for v in right)
+       elif isinstance(right, dict):
+           right = dict((unicode(k, 'utf-8') if isinstance(k, str) else k,
+                          unicode(v, 'utf-8') if isinstance(v, str) else v)
+                             for k,v in right.iteritems())
+       elif isinstance(right, str):
+           right = unicode(right, 'utf-8')
+       return str.__new__(Utf8, unicode(self, 'utf-8').__mod__(right).encode('utf-8'))
+   def __ge__(self, string): return sort_key(self) >= sort_key(string)
+   def __gt__(self, string): return sort_key(self) >  sort_key(string)
+   def __le__(self, string): return sort_key(self) <= sort_key(string)
+   def __lt__(self, string): return sort_key(self) <  sort_key(string)
+
+
+if __name__ == '__main__':
+    def doctests():
+       u"""
+       doctests:
+       >>> test_unicode=u'ПРоба Є PRobe'
+       >>> test_unicode_word=u'ПРоба'
+       >>> test_number_str='12345'
+       >>> test_unicode
+       u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe'
+       >>> print test_unicode
+       ПРоба Є PRobe
+       >>> test_word=test_unicode_word.encode('utf-8')
+       >>> test_str=test_unicode.encode('utf-8')
+       >>> s=Utf8(test_str)
+       >>> s
+       'ПРоба Є PRobe'
+       >>> type(s)
+       <class '__main__.Utf8'>
+       >>> s == test_str
+       True
+       >>> len(test_str) # wrong length of utf8-string!
+       19
+       >>> len(test_unicode) # RIGHT!
+       13
+       >>> len(s) # RIGHT!
+       13
+       >>> size(test_str) # size of utf-8 string (in bytes) == len(str)
+       19
+       >>> size(test_unicode) # size of unicode string in bytes (packed to utf-8 string)
+       19
+       >>> size(s) # size of utf-8 string in bytes
+       19
+       >>> try: # utf-8 is a multibyte string. Convert it to unicode for use with builtin ord()
+       ...     __builtin__.ord('б')  #  ascii string
+       ... except Exception, e:
+       ...     print 'Exception:', e
+       Exception: ord() expected a character, but string of length 2 found
+       >>> ord('б') # utf8.ord() is used(!!!)
+       1073
+       >>> ord(u'б') # utf8.ord() is used(!!!)
+       1073
+       >>> ord(s[3])  # utf8.ord() is used(!!!)
+       1073
+       >>> chr(ord(s[3])) # utf8.chr() and utf8.chr() is used(!!!)
+       'б'
+       >>> type(chr(1073))  # utf8.chr() is used(!!!)
+       <class '__main__.Utf8'>
+       >>> s=Utf8(test_unicode)
+       >>> s
+       'ПРоба Є PRobe'
+       >>> s == test_str
+       True
+       >>> test_str == s
+       True
+       >>> s == test_unicode
+       True
+       >>> test_unicode == s
+       True
+       >>> print test_str.upper() # only ASCII characters uppered
+       ПРоба Є PROBE
+       >>> print test_unicode.upper() # unicode gives right result
+       ПРОБА Є PROBE
+       >>> s.upper() # utf8 class use unicode.upper()
+       'ПРОБА Є PROBE'
+       >>> type(s.upper())
+       <class '__main__.Utf8'>
+       >>> s.lower()
+       'проба є probe'
+       >>> type(s.lower())
+       <class '__main__.Utf8'>
+       >>> s.capitalize()
+       'Проба є probe'
+       >>> type(s.capitalize())
+       <class '__main__.Utf8'>
+       >>> len(s)
+       13
+       >>> len(test_unicode)
+       13
+       >>> s+'. Probe is проба'
+       'ПРоба Є PRobe. Probe is проба'
+       >>> type(s+'. Probe is проба')
+       <class '__main__.Utf8'>
+       >>> s+u'. Probe is проба'
+       'ПРоба Є PRobe. Probe is проба'
+       >>> type(s+u'. Probe is проба')
+       <class '__main__.Utf8'>
+       >>> s+s
+       'ПРоба Є PRobeПРоба Є PRobe'
+       >>> type(s+s)
+       <class '__main__.Utf8'>
+       >>> a=s
+       >>> a+=s
+       >>> a+=test_unicode
+       >>> a+=test_str
+       >>> a
+       'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe'
+       >>> type(a)
+       <class '__main__.Utf8'>
+       >>> s*3
+       'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe'
+       >>> type(s*3)
+       <class '__main__.Utf8'>
+       >>> a=Utf8("-проба-")
+       >>> a*=10
+       >>> a
+       '-проба--проба--проба--проба--проба--проба--проба--проба--проба--проба-'
+       >>> type(a)
+       <class '__main__.Utf8'>
+       >>> print "'"+test_str.center(17)+"'" # WRONG RESULT!
+       'ПРоба Є PRobe'
+       >>> s.center(17) # RIGHT!
+       '  ПРоба Є PRobe  '
+       >>> type(s.center(17))
+       <class '__main__.Utf8'>
+       >>> (test_word+test_number_str).isalnum() # WRONG RESULT! non ASCII chars are detected as non alpha
+       False
+       >>> Utf8(test_word+test_number_str).isalnum()
+       True
+       >>> s.isalnum()
+       False
+       >>> test_word.isalpha() # WRONG RESULT! Non ASCII characters are detected as non alpha
+       False
+       >>> Utf8(test_word).isalpha() # RIGHT!
+       True
+       >>> s.lower().islower()
+       True
+       >>> s.upper().isupper()
+       True
+       >>> print test_str.zfill(17) # WRONG RESULT!
+       ПРоба Є PRobe
+       >>> s.zfill(17) # RIGHT!
+       '0000ПРоба Є PRobe'
+       >>> type(s.zfill(17))
+       <class '__main__.Utf8'>
+       >>> s.istitle()
+       False
+       >>> s.title().istitle()
+       True
+       >>> Utf8('1234').isdigit()
+       True
+       >>> Utf8(' \t').isspace()
+       True
+       >>> s.join('•|•')
+       '•ПРоба Є PRobe|ПРоба Є PRobe•'
+       >>> s.join((str('(utf8 тест1)'), unicode('(unicode тест2)','utf-8'), '(ascii test3)'))
+       '(utf8 тест1)ПРоба Є PRobe(unicode тест2)ПРоба Є PRobe(ascii test3)'
+       >>> type(s)
+       <class '__main__.Utf8'>
+       >>> s==test_str
+       True
+       >>> s==test_unicode
+       True
+       >>> s.swapcase()
+       'прОБА є prOBE'
+       >>> type(s.swapcase())
+       <class '__main__.Utf8'>
+       >>> s.truncate(10)
+       'ПРоба Є...'
+       >>> s.truncate(20)
+       'ПРоба Є PRobe'
+       >>> s.truncate(10, '•••') # utf-8 string as *dots*
+       'ПРоба Є•••'
+       >>> s.truncate(10, u'®') # you can use unicode string as *dots*
+       'ПРоба Є P®'
+       >>> type(s.truncate(10))
+       <class '__main__.Utf8'>
+       >>> Utf8(s.encode('koi8-u'), 'koi8-u')
+       'ПРоба Є PRobe'
+       >>> s.decode() # convert utf-8 string to unicode
+       u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe'
+       >>> a='про\\tba'
+       >>> str_tmp=a.expandtabs()
+       >>> utf8_tmp=Utf8(a).expandtabs()
+       >>> utf8_tmp.replace(' ','.') # RIGHT! (default tabsize is 8)
+       'про.....ba'
+       >>> utf8_tmp.index('b')
+       8
+       >>> print "'"+str_tmp.replace(' ','.')+"'" # WRONG STRING LENGTH!
+       'про..ba'
+       >>> str_tmp.index('b') # WRONG index of 'b' character
+       8
+       >>> print "'"+a.expandtabs(4).replace(' ','.')+"'" # WRONG RESULT!
+       'про..ba'
+       >>> Utf8(a).expandtabs(4).replace(' ','.') # RIGHT!
+       'про.ba'
+       >>> s.find('Є')
+       6
+       >>> s.find(u'Є')
+       6
+       >>> s.find(' ', 6)
+       7
+       >>> s.rfind(' ')
+       7
+       >>> s.partition('Є')
+       ('ПРоба ', 'Є', ' PRobe')
+       >>> s.partition(u'Є')
+       ('ПРоба ', 'Є', ' PRobe')
+       >>> (a,b,c) = s.partition('Є')
+       >>> type(a), type(b), type(c)
+       (<class '__main__.Utf8'>, <class '__main__.Utf8'>, <class '__main__.Utf8'>)
+       >>> s.partition(' ')
+       ('ПРоба', ' ', 'Є PRobe')
+       >>> s.rpartition(' ')
+       ('ПРоба Є', ' ', 'PRobe')
+       >>> s.index('Є')
+       6
+       >>> s.rindex(u'Є')
+       6
+       >>> s.index(' ')
+       5
+       >>> s.rindex(' ')
+       7
+       >>> a=Utf8('а б ц д е а б ц д е а\\tб ц д е')
+       >>> a.split()
+       ['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е']
+       >>> a.rsplit()
+       ['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е']
+       >>> a.expandtabs().split('б')
+       ['а ', ' ц д е а ', ' ц д е а   ', ' ц д е']
+       >>> a.expandtabs().rsplit('б')
+       ['а ', ' ц д е а ', ' ц д е а   ', ' ц д е']
+       >>> a.expandtabs().split(u'б', 1)
+       ['а ', ' ц д е а б ц д е а   б ц д е']
+       >>> a.expandtabs().rsplit(u'б', 1)
+       ['а б ц д е а б ц д е а   ', ' ц д е']
+       >>> a=Utf8("рядок1\\nрядок2\\nрядок3")
+       >>> a.splitlines()
+       ['рядок1', 'рядок2', 'рядок3']
+       >>> a.splitlines(True)
+       ['рядок1\\n', 'рядок2\\n', 'рядок3']
+       >>> s[6]
+       'Є'
+       >>> s[0]
+       'П'
+       >>> s[-1]
+       'e'
+       >>> s[:10]
+       'ПРоба Є PR'
+       >>> s[2:-2:2]
+       'оаЄPo'
+       >>> s[::-1]
+       'eboRP Є абоРП'
+       >>> s.startswith('ПР')
+       True
+       >>> s.startswith(('ПР', u'об'),0)
+       True
+       >>> s.startswith(u'об', 2, 4)
+       True
+       >>> s.endswith('be')
+       True
+       >>> s.endswith(('be', 'PR', u'Є'))
+       True
+       >>> s.endswith('PR', 8, 10)
+       True
+       >>> s.endswith('Є', -7, -6)
+       True
+       >>> s.count(' ')
+       2
+       >>> s.count(' ',6)
+       1
+       >>> s.count(u'Є')
+       1
+       >>> s.count('Є', 0, 5)
+       0
+       >>> Utf8("Parameters: '%(проба)s', %(probe)04d, %(проба2)s") % { u"проба": s,
+       ...      "not used": "???", "probe":  2, "проба2": u"ПРоба Probe" }
+       "Parameters: 'ПРоба Є PRobe', 0002, ПРоба Probe"
+       >>> a=Utf8(u"Параметр: (%s)-(%s)-[%s]")
+       >>> a%=(s, s[::-1], 1000)
+       >>> a
+       'Параметр: (ПРоба Є PRobe)-(eboRP Є абоРП)-[1000]'
+       >>> if hasattr(Utf8,  'format'):
+       ...     Utf8("Проба <{0}>, {1}, {param1}, {param2}").format(s, u"中文字",
+       ...           param1="барабан", param2=1000) == 'Проба <ПРоба Є PRobe>, 中文字, барабан, 1000'
+       ... else: # format() method is not used in python with version <2.6:
+       ...     print True
+       True
+       >>> u'Б'<u'Ї' # WRONG ORDER!
+       False
+       >>> 'Б'<'Ї' # WRONG ORDER!
+       False
+       >>> Utf8('Б')<'Ї' # RIGHT!
+       True
+       >>> u'д'>u'ґ' # WRONG ORDER!
+       False
+       >>> Utf8('д')>Utf8('ґ') # RIGHT!
+       True
+       >>> u'є'<=u'ж' # WRONG ORDER!
+       False
+       >>> Utf8('є')<=u'ж' # RIGHT!
+       True
+       >>> Utf8('є')<=u'є'
+       True
+       >>> u'Ї'>=u'И' # WRONG ORDER!
+       False
+       >>> Utf8(u'Ї') >= u'И' # RIGHT
+       True
+       >>> Utf8('Є') >= 'Є'
+       True
+       >>> a="яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ"  # str type
+       >>> b=u"яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # unicode type
+       >>> c=Utf8("яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ") # utf8 class
+       >>> result = "".join(sorted(a))
+       >>> result[0:20] # result is not utf8 string, because bytes, not utf8-characters were sorted
+       '\\x80\\x81\\x82\\x83\\x84\\x84\\x85\\x86\\x86\\x87\\x87\\x88\\x89\\x8c\\x8e\\x8f\\x90\\x90\\x91\\x91'
+       >>> try:
+       ...   unicode(result, 'utf-8') # try to convert result (utf-8?) to unicode
+       ... except Exception, e:
+       ...    print 'Exception:', e
+       Exception: 'utf8' codec can't decode byte 0x80 in position 0: unexpected code byte
+       >>> try: # FAILED! (working with bytes, not with utf8-charactes)
+       ...    "".join( sorted(a, key=sort_key) ) # utf8.sort_key may be used with utf8 or unicode strings only!
+       ... except Exception, e:
+       ...    print 'Exception:', e
+       Exception: 'utf8' codec can't decode byte 0xd1 in position 0: unexpected end of data
+       >>> print "".join( sorted(Utf8(a))) # converting *a* to unicode or utf8-string gives us correct result
+       аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
+       >>> print u"".join( sorted(b) ) # WRONG ORDER! Default sort key is used
+       ЄІЇАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯабвгдежзийклмнопрстуфхцчшщьюяєіїҐґ
+       >>> print u"".join( sorted(b, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used
+       аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
+       >>> print "".join( sorted(c) ) # RIGHT ORDER! Utf8 "rich comparison" methods are used
+       аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
+       >>> print "".join( sorted(c, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used
+       аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
+       >>> Utf8().join(sorted(c.decode(), key=sort_key)) # convert to unicode for better performance
+       'аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ'
+       >>> for result in sorted(["Іа", "Астро", u"гала", Utf8("Гоша"), "Єва", "шовк", "аякс", "Їжа",
+       ...                       "ґанок", Utf8("Дар'я"), "білінг", "веб", u"Жужа", "проба", u"тест",
+       ...                       "абетка", "яблуко", "Юляся", "Київ", "лимонад", "ложка", "Матриця",
+       ...                      ], key=sort_key):
+       ...     print result.ljust(20), type(result)
+       абетка         <type 'str'>
+       Астро           <type 'str'>
+       аякс             <type 'str'>
+       білінг         <type 'str'>
+       веб               <type 'str'>
+       гала                 <type 'unicode'>
+       ґанок           <type 'str'>
+       Гоша                 <class '__main__.Utf8'>
+       Дар'я                <class '__main__.Utf8'>
+       Єва               <type 'str'>
+       Жужа                 <type 'unicode'>
+       Іа                 <type 'str'>
+       Їжа               <type 'str'>
+       Київ             <type 'str'>
+       лимонад       <type 'str'>
+       ложка           <type 'str'>
+       Матриця       <type 'str'>
+       проба           <type 'str'>
+       тест                 <type 'unicode'>
+       шовк             <type 'str'>
+       Юляся           <type 'str'>
+       яблуко         <type 'str'>
+       >>> a=Utf8("中文字")
+       >>> L=list(a)
+       >>> L
+       ['中', '文', '字']
+       >>> a="".join(L)
+       >>> print a
+       中文字
+       >>> type(a)
+       <type 'str'>
+       >>> a="中文字"  # standard str type
+       >>> L=list(a)
+       >>> L
+       ['\\xe4', '\\xb8', '\\xad', '\\xe6', '\\x96', '\\x87', '\\xe5', '\\xad', '\\x97']
+       >>> from string import maketrans
+       >>> str_tab=maketrans('PRobe','12345')
+       >>> unicode_tab={ord(u'П'):ord(u'Ж'),
+       ...              ord(u'Р')      : u'Ш',
+       ...              ord(Utf8('о')) : None,  # utf8.ord() is used
+       ...              ord('б')       : None,  # -//-//-
+       ...              ord(u'а')      : u"中文字",
+       ...              ord(u'Є')      : Utf8('•').decode(), # only unicode type is supported
+       ...             }
+       >>> s.translate(unicode_tab).translate(str_tab, deletechars=' ')
+       'ЖШ中文字•12345'
+       """
+       import sys
+       reload(sys) # brings back sys.setdefaultencoding(), which Python 2 removes from sys at startup
+       sys.setdefaultencoding("UTF-8") # lets the tests mix unicode with utf-8 str (e.g. test_unicode == s)
+       import doctest
+       print "DOCTESTS STARTED..."
+       doctest.testmod()
+       print "DOCTESTS FINISHED"
+    # runs only when this file is executed as a script (see the __name__ check above)
+    doctests()
+