added new slugify with unidecode

author Boris Bobrov <breton@cynicmansion.ru>

Tue, 11 Mar 2014 10:50:01 +0000 (15:50 +0500)

committer Christopher Allan Webber <cwebber@dustycloud.org>

Tue, 11 Mar 2014 16:01:04 +0000 (11:01 -0500)
author Boris Bobrov <breton@cynicmansion.ru>
Tue, 11 Mar 2014 10:50:01 +0000 (15:50 +0500)
committer Christopher Allan Webber <cwebber@dustycloud.org>
Tue, 11 Mar 2014 16:01:04 +0000 (11:01 -0500)
diff --git a/mediagoblin/tests/test_util.py b/mediagoblin/tests/test_util.py

index bc14f528e7c02499a6146255c526ed8e900cf4fe..9d9b1c161448b291b26b8854da9d0412eb1521da 100644 (file)
--- a/mediagoblin/tests/test_util.py
+++ b/mediagoblin/tests/test_util.py
@@ -77,6 +77,12 @@ def test_slugify():
      assert url.slugify(u'a w@lk in the park?') == u'a-w-lk-in-the-park'
      assert url.slugify(u'a walk in the par\u0107') == u'a-walk-in-the-parc'
      assert url.slugify(u'\u00E0\u0042\u00E7\u010F\u00EB\u0066') == u'abcdef'
+    # Russian
+    assert url.slugify(u'\u043f\u0440\u043e\u0433\u0443\u043b\u043a\u0430 '
+            u'\u0432 \u043f\u0430\u0440\u043a\u0435') == u'progulka-v-parke'
+    # Korean
+    assert (url.slugify(u'\uacf5\uc6d0\uc5d0\uc11c \uc0b0\ucc45') ==
+            u'gongweoneseo-sancaeg')
  
  def test_locale_to_lower_upper():
      """
diff --git a/mediagoblin/tools/url.py b/mediagoblin/tools/url.py

index d9179f9e2eab18a1f168160df890402d29e99d34..657c0373d4397d66873ddfcf0bd1458ae6b3fac7 100644 (file)
--- a/mediagoblin/tools/url.py
+++ b/mediagoblin/tools/url.py
@@ -15,15 +15,7 @@
  # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  
  import re
-# This import *is* used; see word.encode('tranlit/long') below.
-from unicodedata import normalize
-
-try:
-    import translitcodec
-    USING_TRANSLITCODEC = True
-except ImportError:
-    USING_TRANSLITCODEC = False
-
+from unidecode import unidecode
  
  _punct_re = re.compile(r'[\t !"#:$%&\'()*\-/<=>?@\[\\\]^_`{|},.]+')
  
@@ -34,11 +26,5 @@ def slugify(text, delim=u'-'):
      """
      result = []
      for word in _punct_re.split(text.lower()):
-        if USING_TRANSLITCODEC:
-            word = word.encode('translit/long')
-        else:
-            word = normalize('NFKD', word).encode('ascii', 'ignore')
-
-        if word:
-            result.append(word)
+        result.extend(unidecode(word).split())
      return unicode(delim.join(result))
diff --git a/setup.py b/setup.py

index 7abd896c26e181ecdcda30c68b5d74710d9237b0..a3cc055c0d00e375a000ea8ed1bb1d9c609cf8a3 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -65,6 +65,7 @@ try:
          'pytz',
          'six',
          'oauthlib==0.5.0',
+        'unidecode',
  
          ## Annoying.  Please remove once we can!  We only indirectly
          ## use pbr, and currently it breaks things, presumably till
author	Boris Bobrov <breton@cynicmansion.ru>
	Tue, 11 Mar 2014 10:50:01 +0000 (15:50 +0500)
committer	Christopher Allan Webber <cwebber@dustycloud.org>
	Tue, 11 Mar 2014 16:01:04 +0000 (11:01 -0500)
mediagoblin/tests/test_util.py		patch | blob | blame | history
mediagoblin/tools/url.py		patch | blob | blame | history
setup.py		patch | blob | blame | history