Merge pull request #315 from winniehell/patch_urlify_translitcodec

Use translitcodec for urlify
2014-03-02 17:24:07 -06:00
parent 4a65e773ce 15cedc0269
commit cb75a96e2c
6 changed files with 2548 additions and 2 deletions
@@ -0,0 +1,9 @@
+translitcodec was originally written by Jason Kirtland in 2008.
+
+Contributors are:
+
+- Jason Kirtland <jek@discorporate.us>
+- Craig Dennis <craig@idealist.org>
+
+The translitcodec source distribution includes the 'transtab' package
+by Markus Kuhn <mkuhn@acm.org>.
@@ -0,0 +1,31 @@
+=====================
+translitcodec Changes
+=====================
+
+0.3
+---
+
+Released on February 14, 2011
+
+- Fixes to the transtab table rebuilding tool.
+
+- Added translitcodec.__version__
+
+0.2
+---
+
+Released on January 27, 2011
+
+- Resolves issue of "TypeError: character mapping must return integer,
+  None or unicode" when a blank value (e.g. \N{ZERO WIDTH SPACE}, \u200B)
+  was encoded.  Unicode blanks are now returned.
+
+- Characters in the ASCII range are no longer included in the translation
+  tables.
+
+0.1
+---
+
+Released on December 28, 2008
+
+- Initial packaged release.
@@ -0,0 +1,20 @@
+Copyright (c) 2008 Jason Kirtland <jek at discorporate us>
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,44 @@
+-*- coding: utf-8 -*-
+
+Unicode to 8-bit charset transliteration codec.
+
+This package contains codecs for transliterating ISO 10646 texts into
+best-effort representations using smaller coded character sets (ASCII,
+ISO 8859, etc.).  The translation tables used by the codecs are from
+the ``transtab`` collection by Markus Kuhn.
+
+Three types of transliterating codecs are provided:
+
+  "long", using as many characters as needed to make a natural
+  replacement.  For example, \u00e4 LATIN SMALL LETTER A WITH
+  DIAERESIS ``ä`` will be replaced with ``ae``.
+
+  "short", using the minimum number of characters to make a
+  replacement.  For example, \u00e4 LATIN SMALL LETTER A WITH
+  DIAERESIS ``ä`` will be replaced with ``a``.
+
+  "one", only performing single character replacements.  Characters
+  that cannot be transliterated with a single character are passed
+  through unchanged. For example, \u2639 WHITE FROWNING FACE ``☹``
+  will be passed through unchanged.
+
+Using the codecs is simple::
+
+  >>> import translitcodec
+  >>> u'fácil € ☺'.encode('translit/long')
+  u'facil EUR :-)'
+  >>> u'fácil € ☺'.encode('translit/short')
+  u'facil E :-)'
+
+The codecs return Unicode by default.  To receive a bytestring back,
+either chain the output of encode() to another codec, or append the
+name of the desired byte encoding to the codec name::
+
+  >>> u'fácil € ☺'.encode('translit/one').encode('ascii', 'replace')
+  'facil E ?'
+  >>> u'fácil € ☺'.encode('translit/one/ascii', 'replace')
+  'facil E ?'
+
+The package also supplies a 'transliterate' codec, an alias for
+'translit/long'.
+
@@ -17,7 +17,6 @@ import cgi
 import urllib
 import struct
 import decimal
-import unicodedata
 from cStringIO import StringIO
 from gluon.utils import simple_hash, web2py_uuid, DIGEST_ALG_BY_SIZE
 from gluon.dal import FieldVirtual, FieldMethod
@@ -2519,7 +2518,7 @@ def urlify(s, maxlen=80, keep_underscores=False):
    if isinstance(s, str):
        s = s.decode('utf-8')             # to unicode
    s = s.lower()                         # to lowercase
-    s = unicodedata.normalize('NFKD', s)  # normalize eg è => e, ñ => n
+    s = s.encode('translit/long')         # replace special characters
    s = s.encode('ascii', 'ignore')       # encode as ASCII
    s = re.sub('&\w+?;', '', s)           # strip html entities
    if keep_underscores: